From ff82bf8a0d80bc8cfbb80c4d1259f6807b6d4902 Mon Sep 17 00:00:00 2001 From: Robin Deeboonchai Date: Sat, 6 Jun 2026 04:10:50 -0700 Subject: [PATCH 1/4] refactor: pseudo-shared, starting on machine api mode only --- pkg/cloudprovider/suite_drift_test.go | 380 ++-- pkg/cloudprovider/suite_features_test.go | 2048 ++++++++++--------- pkg/cloudprovider/suite_integration_test.go | 90 +- pkg/cloudprovider/suite_modes_test.go | 81 + pkg/cloudprovider/suite_offerings_test.go | 1518 +++++++------- 5 files changed, 2089 insertions(+), 2028 deletions(-) create mode 100644 pkg/cloudprovider/suite_modes_test.go diff --git a/pkg/cloudprovider/suite_drift_test.go b/pkg/cloudprovider/suite_drift_test.go index fe0b1edc1..cfb17a39c 100644 --- a/pkg/cloudprovider/suite_drift_test.go +++ b/pkg/cloudprovider/suite_drift_test.go @@ -44,243 +44,247 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/test" ) -var _ = Describe("CloudProvider", func() { - Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { +func runAKSMachineAPIDriftTests() { + Context("Drift", func() { + var nodeClaim *karpv1.NodeClaim + var node *v1.Node + var createInput *fake.AKSMachineCreateOrUpdateInput + BeforeEach(func() { - testOptions = test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), - UseSIG: lo.ToPtr(true), + instanceType := "Standard_D2_v2" + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{ + NodeSelector: map[string]string{v1.LabelInstanceTypeStable: instanceType}, }) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node = ExpectScheduled(ctx, env.Client, pod) + // KubeletVersion must be applied to the node to satisfy k8s drift + if nodeClass.Status.KubernetesVersion != nil { + node.Status.NodeInfo.KubeletVersion = "v" + *nodeClass.Status.KubernetesVersion + } + node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] = 
"61f71907-753f-4802-a901-47361c3664f2" // random UUID + + ExpectApplied(ctx, env.Client, node) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput = azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + + nodeClaims, err := cloudProvider.List(ctx) + Expect(err).ToNot(HaveOccurred()) + Expect(nodeClaims).To(HaveLen(1)) + + nodeClaim = nodeClaims[0] + nodeClaim.Status.NodeName = node.Name // Normally core would do this. + nodeClaim.Spec.NodeClassRef = &karpv1.NodeClassReference{ + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, + } + }) - ctx = coreoptions.ToContext(ctx, coretest.Options()) - ctx = options.ToContext(ctx, testOptions) - - azureEnv = test.NewEnvironment(ctx, env) - azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) - statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) - cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) + It("should not fail if nodeClass does not exist", func() { + ExpectDeleted(ctx, env.Client, nodeClass) + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(BeEmpty()) + }) - 
cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) - clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) - coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) + It("should not fail if nodePool does not exist", func() { + ExpectDeleted(ctx, env.Client, nodePool) + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(BeEmpty()) + }) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should not return drifted if the NodeClaim is valid", func() { + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(BeEmpty()) }) - AfterEach(func() { - // Wait for any async polling goroutines to complete before resetting - cloudProvider.WaitForInstancePromises() - cluster.Reset() - azureEnv.Reset(ctx) - azureEnvNonZonal.Reset(ctx) + It("should error drift if NodeClaim doesn't have provider id", func() { + nodeClaim.Status = karpv1.NodeClaimStatus{} + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).To(HaveOccurred()) + Expect(drifted).To(BeEmpty()) }) - Context("Drift", func() { - var nodeClaim *karpv1.NodeClaim - var node *v1.Node - var createInput *fake.AKSMachineCreateOrUpdateInput + Context("Node Image Drift", func() { + It("should trigger drift when DriftAction field is available", func() { + // Find the AKS machine that was created during BeforeEach + aksMachineID := fake.MkMachineID(testOptions.NodeResourceGroup, testOptions.ClusterName, testOptions.AKSMachinesPoolName, createInput.AKSMachineName) - BeforeEach(func() { - instanceType := "Standard_D2_v2" - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := 
coretest.UnschedulablePod(coretest.PodOptions{ - NodeSelector: map[string]string{v1.LabelInstanceTypeStable: instanceType}, - }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node = ExpectScheduled(ctx, env.Client, pod) - // KubeletVersion must be applied to the node to satisfy k8s drift - if nodeClass.Status.KubernetesVersion != nil { - node.Status.NodeInfo.KubeletVersion = "v" + *nodeClass.Status.KubernetesVersion - } - node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] = "61f71907-753f-4802-a901-47361c3664f2" // random UUID + // Get the existing machine from the fake store + existingMachine, ok := azureEnv.AKSDataStorage.AKSMachines.Load(aksMachineID) + Expect(ok).To(BeTrue(), "AKS machine should exist in fake store") - ExpectApplied(ctx, env.Client, node) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput = azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := existingMachine - nodeClaims, err := cloudProvider.List(ctx) - Expect(err).ToNot(HaveOccurred()) - Expect(nodeClaims).To(HaveLen(1)) - - nodeClaim = nodeClaims[0] - nodeClaim.Status.NodeName = node.Name // Normally core would do this. 
- nodeClaim.Spec.NodeClassRef = &karpv1.NodeClassReference{ - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - Name: nodeClass.Name, + // Set DriftAction to "Recreate" to trigger drift + if aksMachine.Properties == nil { + aksMachine.Properties = &armcontainerservice.MachineProperties{} } - }) + if aksMachine.Properties.Status == nil { + aksMachine.Properties.Status = &armcontainerservice.MachineStatus{} + } + aksMachine.Properties.Status.DriftAction = lo.ToPtr(armcontainerservice.DriftActionRecreate) + aksMachine.Properties.Status.DriftReason = lo.ToPtr("ClusterConfigurationChanged") + + // Update the machine in the fake store + azureEnv.AKSDataStorage.AKSMachines.Store(aksMachineID, aksMachine) - It("should not fail if nodeClass does not exist", func() { - ExpectDeleted(ctx, env.Client, nodeClass) drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) + Expect(drifted).To(Equal(ClusterConfigDrift)) }) + }) - It("should not fail if nodePool does not exist", func() { - ExpectDeleted(ctx, env.Client, nodePool) + Context("Node Image Drift", func() { + It("should succeed with no drift when nothing changes", func() { drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) + Expect(drifted).To(Equal(NoDrift)) }) - It("should not return drifted if the NodeClaim is valid", func() { + It("should succeed with no drift when ConditionTypeImagesReady is not true", func() { + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeImagesReady, "ImagesNoLongerReady", "test when images aren't ready") + ExpectApplied(ctx, env.Client, nodeClass) drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) + Expect(drifted).To(Equal(NoDrift)) }) - It("should error drift if NodeClaim doesn't have provider id", 
func() { - nodeClaim.Status = karpv1.NodeClaimStatus{} + // Note: this case shouldn't be able to happen in practice since if Images is empty ConditionTypeImagesReady should be false. + It("should error when Images are empty", func() { + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + nodeClass.Status.Images = []v1beta1.NodeImage{} + ExpectApplied(ctx, env.Client, nodeClass) drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) Expect(err).To(HaveOccurred()) - Expect(drifted).To(BeEmpty()) + Expect(drifted).To(Equal(NoDrift)) }) - Context("Node Image Drift", func() { - It("should trigger drift when DriftAction field is available", func() { - // Find the AKS machine that was created during BeforeEach - aksMachineID := fake.MkMachineID(testOptions.NodeResourceGroup, testOptions.ClusterName, testOptions.AKSMachinesPoolName, createInput.AKSMachineName) - - // Get the existing machine from the fake store - existingMachine, ok := azureEnv.AKSDataStorage.AKSMachines.Load(aksMachineID) - Expect(ok).To(BeTrue(), "AKS machine should exist in fake store") - - aksMachine := existingMachine - - // Set DriftAction to "Recreate" to trigger drift - if aksMachine.Properties == nil { - aksMachine.Properties = &armcontainerservice.MachineProperties{} - } - if aksMachine.Properties.Status == nil { - aksMachine.Properties.Status = &armcontainerservice.MachineStatus{} - } - aksMachine.Properties.Status.DriftAction = lo.ToPtr(armcontainerservice.DriftActionRecreate) - aksMachine.Properties.Status.DriftReason = lo.ToPtr("ClusterConfigurationChanged") - - // Update the machine in the fake store - azureEnv.AKSDataStorage.AKSMachines.Store(aksMachineID, aksMachine) - - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(ClusterConfigDrift)) - }) + It("should trigger drift when the image version changes", func() { + test.ApplyCIGImagesWithVersion(nodeClass, "202503.02.0") + ExpectApplied(ctx, env.Client, nodeClass) + 
drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(ImageDrift)) }) + }) - Context("Node Image Drift", func() { - It("should succeed with no drift when nothing changes", func() { - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) + Context("Kubernetes Version", func() { + It("should succeed with no drift when nothing changes", func() { + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NoDrift)) + }) - It("should succeed with no drift when ConditionTypeImagesReady is not true", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) - nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeImagesReady, "ImagesNoLongerReady", "test when images aren't ready") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) + It("should succeed with no drift when KubernetesVersionReady is not true", func() { + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeKubernetesVersionReady, "K8sVersionNoLongerReady", "test when k8s isn't ready") + ExpectApplied(ctx, env.Client, nodeClass) + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NoDrift)) + }) - // Note: this case shouldn't be able to happen in practice since if Images is empty ConditionTypeImagesReady should be false. 
- It("should error when Images are empty", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) - nodeClass.Status.Images = []v1beta1.NodeImage{} - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).To(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) + // TODO (charliedmcb): I'm wondering if we actually want to have these soft-error cases switch to return an error if no-drift condition was found. + It("shouldn't error or be drifted when KubernetesVersion is empty", func() { + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + nodeClass.Status.KubernetesVersion = lo.ToPtr("") + ExpectApplied(ctx, env.Client, nodeClass) + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NoDrift)) + }) - It("should trigger drift when the image version changes", func() { - test.ApplyCIGImagesWithVersion(nodeClass, "202503.02.0") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(ImageDrift)) - }) + It("shouldn't error or be drifted when NodeName is missing", func() { + nodeClaim.Status.NodeName = "" + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NoDrift)) }) - Context("Kubernetes Version", func() { - It("should succeed with no drift when nothing changes", func() { - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) + It("shouldn't error or be drifted when node is not found", func() { + nodeClaim.Status.NodeName = "NodeWhoDoesNotExist" + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NoDrift)) + }) - It("should succeed with no drift when KubernetesVersionReady is not true", func() { - nodeClass 
= ExpectExists(ctx, env.Client, nodeClass) - nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeKubernetesVersionReady, "K8sVersionNoLongerReady", "test when k8s isn't ready") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) + It("shouldn't error or be drifted when node is deleting", func() { + node = ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) + node.Finalizers = append(node.Finalizers, test.TestingFinalizer) + ExpectApplied(ctx, env.Client, node) + Expect(env.Client.Delete(ctx, node)).ToNot(HaveOccurred()) + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NoDrift)) - // TODO (charliedmcb): I'm wondering if we actually want to have these soft-error cases switch to return an error if no-drift condition was found. - It("shouldn't error or be drifted when KubernetesVersion is empty", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) - nodeClass.Status.KubernetesVersion = lo.ToPtr("") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) + // cleanup + node = ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) + deepCopy := node.DeepCopy() + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return finalizer == test.TestingFinalizer }) + Expect(env.Client.Patch(ctx, node, client.StrategicMergeFrom(deepCopy))).NotTo(HaveOccurred()) + ExpectDeleted(ctx, env.Client, node) + }) - It("shouldn't error or be drifted when NodeName is missing", func() { - nodeClaim.Status.NodeName = "" - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) + It("should succeed with drift true when KubernetesVersion is 
new", func() { + nodeClass = ExpectExists(ctx, env.Client, nodeClass) - It("shouldn't error or be drifted when node is not found", func() { - nodeClaim.Status.NodeName = "NodeWhoDoesNotExist" - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) + semverCurrentK8sVersion := lo.Must(semver.ParseTolerant(*nodeClass.Status.KubernetesVersion)) + semverCurrentK8sVersion.Minor = semverCurrentK8sVersion.Minor + 1 + nodeClass.Status.KubernetesVersion = lo.ToPtr(semverCurrentK8sVersion.String()) - It("shouldn't error or be drifted when node is deleting", func() { - node = ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) - node.Finalizers = append(node.Finalizers, test.TestingFinalizer) - ExpectApplied(ctx, env.Client, node) - Expect(env.Client.Delete(ctx, node)).ToNot(HaveOccurred()) - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) + ExpectApplied(ctx, env.Client, nodeClass) - // cleanup - node = ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) - deepCopy := node.DeepCopy() - node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { - return finalizer == test.TestingFinalizer - }) - Expect(env.Client.Patch(ctx, node, client.StrategicMergeFrom(deepCopy))).NotTo(HaveOccurred()) - ExpectDeleted(ctx, env.Client, node) - }) + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(K8sVersionDrift)) + }) + }) + }) +} - It("should succeed with drift true when KubernetesVersion is new", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) +var _ = Describe("CloudProvider", func() { + Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), + UseSIG: 
lo.ToPtr(true), + }) - semverCurrentK8sVersion := lo.Must(semver.ParseTolerant(*nodeClass.Status.KubernetesVersion)) - semverCurrentK8sVersion.Minor = semverCurrentK8sVersion.Minor + 1 - nodeClass.Status.KubernetesVersion = lo.ToPtr(semverCurrentK8sVersion.String()) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) - ExpectApplied(ctx, env.Client, nodeClass) + azureEnv = test.NewEnvironment(ctx, env) + azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(K8sVersionDrift)) - }) - }) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) + + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + 
ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + }) + + AfterEach(func() { + // Wait for any async polling goroutines to complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + azureEnvNonZonal.Reset(ctx) }) + + runAKSMachineAPIDriftTests() }) // Attention: tests under "ProvisionMode = AKSScriptless" are not applicable to ProvisionMode = AKSMachineAPI option. diff --git a/pkg/cloudprovider/suite_features_test.go b/pkg/cloudprovider/suite_features_test.go index ef587d1d3..d277a8f3e 100644 --- a/pkg/cloudprovider/suite_features_test.go +++ b/pkg/cloudprovider/suite_features_test.go @@ -46,107 +46,40 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/utils" ) -var _ = Describe("CloudProvider", func() { - Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { - BeforeEach(func() { - testOptions = test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), - UseSIG: lo.ToPtr(true), - }) - - ctx = coreoptions.ToContext(ctx, coretest.Options()) - ctx = options.ToContext(ctx, testOptions) - - azureEnv = test.NewEnvironment(ctx, env) - azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) - statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) - cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, 
events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) - - cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) - clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) - coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) - - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - }) - - AfterEach(func() { - // Wait for any async polling goroutines to complete before resetting - cloudProvider.WaitForInstancePromises() +func runAKSMachineAPIFeatureTests() { + // Mostly ported from VM test: "ImageReference" and "ImageProvider + Image Family" + // Note: AKS Machine API does not support Community Image Gallery (CIG) + Context("Create - ImageReference and ImageProvider + Image Family", func() { + + // Ported from VM test: "should use shared image gallery images when options are set to UseSIG" + It("should use shared image gallery images", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + // Expect AKS machine to have a shared image gallery reference set via NodeImageVersion + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + + // NodeImageVersion should contain SIG identifier and subscription ID (converted from ImageReference.ID) + nodeImageVersion := 
lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring("AKSUbuntu")) + Expect(nodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) // Format: AKSUbuntu-- + + // Clean up cluster.Reset() azureEnv.Reset(ctx) - azureEnvNonZonal.Reset(ctx) }) - // Mostly ported from VM test: "ImageReference" and "ImageProvider + Image Family" - // Note: AKS Machine API does not support Community Image Gallery (CIG) - Context("Create - ImageReference and ImageProvider + Image Family", func() { - - // Ported from VM test: "should use shared image gallery images when options are set to UseSIG" - It("should use shared image gallery images", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Expect AKS machine to have a shared image gallery reference set via NodeImageVersion - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - - // NodeImageVersion should contain SIG identifier and subscription ID (converted from ImageReference.ID) - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring("AKSUbuntu")) - Expect(nodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) // Format: AKSUbuntu-- - // Clean up - cluster.Reset() - azureEnv.Reset(ctx) - }) - - // Note: Community Images tests are not ported since Community Images are not supported for AKS Machine API - // This aligns with the warning in utils.GetAKSMachineNodeImageVersionFromImageID() - - // Ported from VM test DescribeTable: "should select the right Shared 
Image Gallery image for a given instance type" - DescribeTable("should select the right Shared Image Gallery NodeImageVersion for a given instance type", - func(instanceType string, imageFamily string, expectedImageDefinition string) { - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - - // NodeImageVersion should contain the expected image definition - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - }, - // Ported entries from VM test, covering SIG images for different generations and architectures - Entry("Gen2, Gen1 instance type with AKSUbuntu image family", "Standard_D2_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ImageDefinition), - Entry("Gen1 instance type with AKSUbuntu image family", "Standard_D2_v3", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen1ImageDefinition), - Entry("ARM instance type with AKSUbuntu image family", "Standard_D16plds_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ArmImageDefinition), - ) - - It("should select the right Shared Image Gallery NodeImageVersion for a given instance 
type, Gen2 instance type with AzureLinux image family", func() { - instanceType := "Standard_D2_v5" - imageFamily := v1beta1.AzureLinuxImageFamily - kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() - expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) - expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ImageDefinition, imagefamily.AzureLinuxGen2ImageDefinition) + // Note: Community Images tests are not ported since Community Images are not supported for AKS Machine API + // This aligns with the warning in utils.GetAKSMachineNodeImageVersionFromImageID() + // Ported from VM test DescribeTable: "should select the right Shared Image Gallery image for a given instance type" + DescribeTable("should select the right Shared Image Gallery NodeImageVersion for a given instance type", + func(instanceType string, imageFamily string, expectedImageDefinition string) { nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1.LabelInstanceTypeStable, @@ -167,1053 +100,1124 @@ var _ = Describe("CloudProvider", func() { // NodeImageVersion should contain the expected image definition nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - }) - - It("should select the right Shared Image Gallery NodeImageVersion for a given instance type, Gen1 instance type with AzureLinux image family", func() { - instanceType := "Standard_D2_v3" - imageFamily := v1beta1.AzureLinuxImageFamily - kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() - expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) - expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen1ImageDefinition, imagefamily.AzureLinuxGen1ImageDefinition) + }, + // Ported entries from VM 
test, covering SIG images for different generations and architectures + Entry("Gen2, Gen1 instance type with AKSUbuntu image family", "Standard_D2_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ImageDefinition), + Entry("Gen1 instance type with AKSUbuntu image family", "Standard_D2_v3", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen1ImageDefinition), + Entry("ARM instance type with AKSUbuntu image family", "Standard_D16plds_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ArmImageDefinition), + ) - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) + It("should select the right Shared Image Gallery NodeImageVersion for a given instance type, Gen2 instance type with AzureLinux image family", func() { + instanceType := "Standard_D2_v5" + imageFamily := v1beta1.AzureLinuxImageFamily + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ImageDefinition, imagefamily.AzureLinuxGen2ImageDefinition) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + + // NodeImageVersion should contain the expected image definition + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) + }) - // NodeImageVersion should contain the expected image definition - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - }) + It("should select the right Shared Image Gallery NodeImageVersion for a given instance type, Gen1 instance type with AzureLinux image family", func() { + instanceType := "Standard_D2_v3" + imageFamily := v1beta1.AzureLinuxImageFamily + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen1ImageDefinition, imagefamily.AzureLinuxGen1ImageDefinition) - It("should select the right Shared Image Gallery NodeImageVersion for a 
given instance type, ARM instance type with AzureLinux image family", func() { - instanceType := "Standard_D16plds_v5" - imageFamily := v1beta1.AzureLinuxImageFamily - kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() - expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) - expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ArmImageDefinition, imagefamily.AzureLinuxGen2ArmImageDefinition) + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + + // NodeImageVersion should contain the expected image definition + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) + }) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := 
coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + It("should select the right Shared Image Gallery NodeImageVersion for a given instance type, ARM instance type with AzureLinux image family", func() { + instanceType := "Standard_D16plds_v5" + imageFamily := v1beta1.AzureLinuxImageFamily + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ArmImageDefinition, imagefamily.AzureLinuxGen2ArmImageDefinition) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) - // NodeImageVersion should contain the expected image definition - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - // Clean up - cluster.Reset() - azureEnv.Reset(ctx) - }) - }) + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - // Ported from VM test: "GPU Workloads + Nodes" - Context("Create - GPU Workloads + Nodes", func() { - // Ported from VM test: "should schedule non-GPU pod onto the cheapest non-GPU capable node" - It("should schedule non-GPU pod onto the cheapest non-GPU capable node", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) + // NodeImageVersion should contain the expected image definition + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - Expect(utils.IsNvidiaEnabledSKU(lo.FromPtr(aksMachine.Properties.Hardware.VMSize))).To(BeFalse()) + // Clean up + cluster.Reset() + azureEnv.Reset(ctx) + }) + }) - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0")) - }) + // Ported from VM test: "GPU Workloads + Nodes" + Context("Create - GPU Workloads + Nodes", func() { + // Ported from VM test: "should schedule non-GPU pod onto the cheapest non-GPU capable node" + It("should schedule non-GPU pod onto the cheapest non-GPU capable node", 
func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + Expect(utils.IsNvidiaEnabledSKU(lo.FromPtr(aksMachine.Properties.Hardware.VMSize))).To(BeFalse()) + + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0")) + }) - // Ported from VM test: "should schedule GPU pod on GPU capable node" - It("should schedule GPU pod on GPU capable node", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - Name: "samples-tf-mnist-demo", - Labels: map[string]string{ - "app": "samples-tf-mnist-demo", - }, + // Ported from VM test: "should schedule GPU pod on GPU capable node" + It("should schedule GPU pod on GPU capable node", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "samples-tf-mnist-demo", + Labels: map[string]string{ + "app": "samples-tf-mnist-demo", }, - Image: "mcr.microsoft.com/azuredocs/samples-tf-mnist-demo:gpu", - ResourceRequirements: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, + }, + Image: "mcr.microsoft.com/azuredocs/samples-tf-mnist-demo:gpu", + ResourceRequirements: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "nvidia.com/gpu": 
resource.MustParse("1"), }, - RestartPolicy: v1.RestartPolicy("OnFailure"), - Tolerations: []v1.Toleration{ - { - Key: "sku", - Operator: v1.TolerationOpEqual, - Value: "gpu", - Effect: v1.TaintEffectNoSchedule, - }, + }, + RestartPolicy: v1.RestartPolicy("OnFailure"), + Tolerations: []v1.Toleration{ + { + Key: "sku", + Operator: v1.TolerationOpEqual, + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, }, - }) - - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - - // the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption - Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3")) - - // Verify AKS machine GPU selection - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - vmSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) - Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) - - // Verify that the node the pod was scheduled on has GPU resource and labels set - Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1"))) - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4")) - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1beta1.ManufacturerNvidia)) - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1")) + }, }) - }) - // Ported from VM test: Context "additional-tags" - Context("Create - Additional Tags", func() { - It("should add additional tags 
to the AKS machine", func() { - // Set up test context with additional tags - aksTestOptions := test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), - UseSIG: lo.ToPtr(true), - AdditionalTags: map[string]string{ - "karpenter.azure.com/test-tag": "test-value", - }, - }) - aksCtx := coreoptions.ToContext(ctx, coretest.Options()) - aksCtx = options.ToContext(aksCtx, aksTestOptions) - - aksAzureEnv := test.NewEnvironment(aksCtx, env) - test.ApplyDefaultStatus(nodeClass, env, aksTestOptions.UseSIG) - aksCloudProvider := New(aksAzureEnv.InstanceTypesProvider, aksAzureEnv.VMInstanceProvider, aksAzureEnv.AKSMachineProvider, recorder, env.Client, aksAzureEnv.ImageProvider, aksAzureEnv.InstanceTypeStore) - aksCluster := state.NewCluster(fakeClock, env.Client, aksCloudProvider) - aksProv := provisioning.NewProvisioner(env.Client, recorder, aksCloudProvider, aksCluster, fakeClock) - - ExpectApplied(aksCtx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(aksCtx, env.Client, aksCluster, aksCloudProvider, aksProv, aksAzureEnv, pod) - ExpectScheduled(aksCtx, env.Client, pod) - - // Verify AKS machine was created with expected tags - Expect(aksAzureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := aksAzureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := input.AKSMachine - Expect(aksMachine).ToNot(BeNil()) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_test-tag")) - Expect(*aksMachine.Properties.Tags["karpenter.azure.com_test-tag"]).To(Equal("test-value")) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) - Expect(*aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal("test-cluster")) - Expect(aksMachine.Properties.Tags).To(HaveKey("compute.aks.billing")) - 
Expect(*aksMachine.Properties.Tags["compute.aks.billing"]).To(Equal("linux")) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) - Expect(*aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(nodePool.Name)) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) - - // Clean up - aksCluster.Reset() - aksAzureEnv.Reset(ctx) - }) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + + // the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption + Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3")) + + // Verify AKS machine GPU selection + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + vmSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) + + // Verify that the node the pod was scheduled on has GPU resource and labels set + Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1"))) + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4")) + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1beta1.ManufacturerNvidia)) + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1")) }) + }) - // Mostly ported from VM test: Context "Ephemeral Disk" - // Note: AKS Machine API has simpler disk configuration compared to VM API - // - VMs control 
detailed StorageProfile, DiffDiskSettings, Placement (NVMe/Cache) - // - AKS machines use OSDiskType (Managed/Ephemeral) and OSDiskSizeGB - // - AKS machines automatically handles placement decisions (NVMe vs Cache disk) - Context("Create - Ephemeral Disk", func() { - // Ported from VM test: "should use ephemeral disk if supported, and has space of at least 128GB by default" - It("should use ephemeral disk if supported, and has space of at least 128GB by default", func() { - // Select a SKU that supports ephemeral disks with sufficient space - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D64s_v3"}, // Has large cache disk space, - }) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Verify AKS machine uses ephemeral disk - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) - }) - - // Ported from VM test: "should fail to provision if ephemeral disk ask for is too large" - It("should fail to provision if ephemeral disk ask for is too large", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, - Operator: v1.NodeSelectorOpGt, - Values: 
[]string{"100000"}, - }) // No InstanceType will match this requirement - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - + // Ported from VM test: Context "additional-tags" + Context("Create - Additional Tags", func() { + It("should add additional tags to the AKS machine", func() { + // Set up test context with additional tags + aksTestOptions := test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), + UseSIG: lo.ToPtr(true), + AdditionalTags: map[string]string{ + "karpenter.azure.com/test-tag": "test-value", + }, }) + aksCtx := coreoptions.ToContext(ctx, coretest.Options()) + aksCtx = options.ToContext(aksCtx, aksTestOptions) + + aksAzureEnv := test.NewEnvironment(aksCtx, env) + test.ApplyDefaultStatus(nodeClass, env, aksTestOptions.UseSIG) + aksCloudProvider := New(aksAzureEnv.InstanceTypesProvider, aksAzureEnv.VMInstanceProvider, aksAzureEnv.AKSMachineProvider, recorder, env.Client, aksAzureEnv.ImageProvider, aksAzureEnv.InstanceTypeStore) + aksCluster := state.NewCluster(fakeClock, env.Client, aksCloudProvider) + aksProv := provisioning.NewProvisioner(env.Client, recorder, aksCloudProvider, aksCluster, fakeClock) + + ExpectApplied(aksCtx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(aksCtx, env.Client, aksCluster, aksCloudProvider, aksProv, aksAzureEnv, pod) + ExpectScheduled(aksCtx, env.Client, pod) + + // Verify AKS machine was created with expected tags + Expect(aksAzureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + input := aksAzureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := input.AKSMachine + Expect(aksMachine).ToNot(BeNil()) + 
Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_test-tag")) + Expect(*aksMachine.Properties.Tags["karpenter.azure.com_test-tag"]).To(Equal("test-value")) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) + Expect(*aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal("test-cluster")) + Expect(aksMachine.Properties.Tags).To(HaveKey("compute.aks.billing")) + Expect(*aksMachine.Properties.Tags["compute.aks.billing"]).To(Equal("linux")) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) + Expect(*aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(nodePool.Name)) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) + + // Clean up + aksCluster.Reset() + aksAzureEnv.Reset(ctx) + }) + }) - // Ported from VM test: should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits - It("should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits", func() { - // Select instances that support ephemeral disks - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, - Operator: v1.NodeSelectorOpGt, - Values: []string{"0"}, - }) - nodeClass.Spec.OSDiskSizeGB = lo.ToPtr[int32](30) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Should select a SKU with ephemeral capability - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - 
Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) - // Should use ephemeral since we required sufficient ephemeral storage - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(30))) + // Mostly ported from VM test: Context "Ephemeral Disk" + // Note: AKS Machine API has simpler disk configuration compared to VM API + // - VMs control detailed StorageProfile, DiffDiskSettings, Placement (NVMe/Cache) + // - AKS machines use OSDiskType (Managed/Ephemeral) and OSDiskSizeGB + // - AKS machines automatically handles placement decisions (NVMe vs Cache disk) + Context("Create - Ephemeral Disk", func() { + // Ported from VM test: "should use ephemeral disk if supported, and has space of at least 128GB by default" + It("should use ephemeral disk if supported, and has space of at least 128GB by default", func() { + // Select a SKU that supports ephemeral disks with sufficient space + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D64s_v3"}, // Has large cache disk space, }) - // Ported from VM test: "should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class" - It("should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class", func() { - // Configure specific OS disk size in NodeClass - nodeClass.Spec.OSDiskSizeGB = lo.ToPtr(int32(256)) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + // Verify AKS machine uses ephemeral disk + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + }) - // Select an instance type that supports the disk size - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D64s_v3"}, - }) + // Ported from VM test: "should fail to provision if ephemeral disk ask for is too large" + It("should fail to provision if ephemeral disk ask for is too large", func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, + Operator: v1.NodeSelectorOpGt, + Values: []string{"100000"}, + }) // No InstanceType will match this requirement + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + }) - // Verify AKS machine was created with correct OS disk size - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := 
azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(256))) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + // Ported from VM test: should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits + It("should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits", func() { + // Select instances that support ephemeral disks + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, + Operator: v1.NodeSelectorOpGt, + Values: []string{"0"}, }) + nodeClass.Spec.OSDiskSizeGB = lo.ToPtr[int32](30) - // Ported from VM test: "should not use ephemeral disk if ephemeral is supported, but we don't have enough space" - It("should not use ephemeral disk if ephemeral is supported, but we don't have enough space", func() { - // Select Standard_D2s_v3 which supports ephemeral but has limited space - // Standard_D2s_V3 has 53GB Of CacheDisk space and 16GB of Temp Disk Space. 
- // With our rule of 128GB being the minimum OSDiskSize, this should fall back to managed disk - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2s_v3"}, - }) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Should fall back to managed disk due to insufficient ephemeral space - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeManaged)) - Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(128))) // Default size - }) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + // Should select a SKU with ephemeral capability + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) + // Should use ephemeral since we required 
sufficient ephemeral storage + Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(30))) }) - Context("Create - Additional Configurations", func() { - It("should handle configured NodeClass", func() { - // Configure comprehensive NodeClass settings - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - CPUManagerPolicy: lo.ToPtr("static"), - CPUCFSQuota: lo.ToPtr(true), - ImageGCHighThresholdPercent: lo.ToPtr(int32(85)), - ImageGCLowThresholdPercent: lo.ToPtr(int32(80)), - FailSwapOn: lo.ToPtr(false), - } - nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.Ubuntu2204ImageFamily) - - // Override context to use a BYO VNet instead of managed VNet - // This allows testing custom subnet configuration (managed VNet doesn't allow custom subnets) - byoClusterSubnetID := "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/byo-vnet-customname/subnets/cluster-subnet" - byoOpts := test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), - UseSIG: lo.ToPtr(true), - SubnetID: lo.ToPtr(byoClusterSubnetID), - }) - byoCtx := options.ToContext(ctx, byoOpts) - - // Extract cluster subnet components and create a test subnet in the same VNet - clusterSubnetComponents, err := utils.GetVnetSubnetIDComponents(byoClusterSubnetID) - Expect(err).ToNot(HaveOccurred()) - testSubnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/nodeclass-subnet", - clusterSubnetComponents.SubscriptionID, clusterSubnetComponents.ResourceGroupName, clusterSubnetComponents.VNetName) - nodeClass.Spec.VNETSubnetID = lo.ToPtr(testSubnetID) - nodeClass.Spec.Tags = map[string]string{ - "custom-tag": "custom-value", - "environment": "test", - "team": "platform", - } - nodeClass.Spec.OSDiskSizeGB = 
lo.ToPtr(int32(100)) - - // Configure GPU workload to test GPU node selection - pod := coretest.UnschedulablePod(coretest.PodOptions{ - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, - Limits: v1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, - }, - }) - - ExpectApplied(byoCtx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(byoCtx, env.Client, statusController, nodeClass) - ExpectProvisionedAndWaitForPromises(byoCtx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(byoCtx, env.Client, pod) + // Ported from VM test: "should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class" + It("should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class", func() { + // Configure specific OS disk size in NodeClass + nodeClass.Spec.OSDiskSizeGB = lo.ToPtr(int32(256)) - // Verify AKS machine was created - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := input.AKSMachine - - // Verify kubelet configuration - Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) - Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUManagerPolicy).To(Equal("static")) - Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUCfsQuota).To(Equal(true)) - Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcHighThreshold).To(Equal(int32(85))) - Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcLowThreshold).To(Equal(int32(80))) - Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) - - // Verify image family configuration - Expect(string(*aksMachine.Properties.OperatingSystem.OSSKU)).To(Equal(v1beta1.Ubuntu2204ImageFamily)) - - // Verify subnet configuration (AKS 
machine should use the specified custom subnet) - Expect(aksMachine.Properties.Network).ToNot(BeNil()) - Expect(aksMachine.Properties.Network.VnetSubnetID).ToNot(BeNil()) - Expect(*aksMachine.Properties.Network.VnetSubnetID).To(Equal(testSubnetID)) - - // Verify custom tags from NodeClass - Expect(aksMachine.Properties.Tags).To(HaveKey("custom-tag")) - Expect(*aksMachine.Properties.Tags["custom-tag"]).To(Equal("custom-value")) - Expect(aksMachine.Properties.Tags).To(HaveKey("environment")) - Expect(*aksMachine.Properties.Tags["environment"]).To(Equal("test")) - Expect(aksMachine.Properties.Tags).To(HaveKey("team")) - Expect(*aksMachine.Properties.Tags["team"]).To(Equal("platform")) - - // Verify Karpenter-managed tags are still present and correct - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) - Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) - Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) - Expect(aksMachine.Properties.Tags).To(HaveKey("compute.aks.billing")) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) - - // Verify OS disk size configuration - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(100))) - - // Verify GPU node was selected (machine should be GPU-capable) - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - vmSize := *aksMachine.Properties.Hardware.VMSize - Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) - - // Verify image selection - NodeImageVersion should be set correctly - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - 
Expect(*aksMachine.Properties.NodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) + // Select an instance type that supports the disk size + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D64s_v3"}, }) - It("should handle configured NodeClaim", func() { - nodeClaim.Spec.Taints = []v1.Taint{ - {Key: "test-taint", Value: "test-value", Effect: v1.TaintEffectNoSchedule}, - } - nodeClaim.Spec.StartupTaints = []v1.Taint{ - {Key: "startup-taint", Value: "startup-value", Effect: v1.TaintEffectNoExecute}, - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) - _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - - // Verify machine was created with correct taints - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - machine := input.AKSMachine + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + // Verify AKS machine was created with correct OS disk size + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(256))) + 
Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + }) - // Check that taints are configured - // Currently, we will use "nodeInitializationTaints" field for all taints. More details in the relevant code (aksmachineinstancehelpers.go). - Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("test-taint=test-value:NoSchedule"))) - Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("startup-taint=startup-value:NoExecute"))) + // Ported from VM test: "should not use ephemeral disk if ephemeral is supported, but we don't have enough space" + It("should not use ephemeral disk if ephemeral is supported, but we don't have enough space", func() { + // Select Standard_D2s_v3 which supports ephemeral but has limited space + // Standard_D2s_V3 has 53GB Of CacheDisk space and 16GB of Temp Disk Space. + // With our rule of 128GB being the minimum OSDiskSize, this should fall back to managed disk + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2s_v3"}, }) - It("should not allow the user to override Karpenter-managed tags", func() { - nodeClass.Spec.Tags = map[string]string{ - "karpenter.azure.com/cluster": "my-override-cluster", - "karpenter.sh/nodepool": "my-override-nodepool", - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Verify AKS machine was created with correct Karpenter-managed tags (not user overrides) - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := input.AKSMachine - - // Check that AKS machine has correct Karpenter-managed tags - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) - Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) - Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) - - // Verify user-specified tags are ignored for Karpenter-managed keys - Expect(*aksMachine.Properties.Tags["karpenter.sh_nodepool"]).ToNot(Equal("my-override-nodepool")) - Expect(*aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).ToNot(Equal("my-override-cluster")) - }) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + // Should fall back to managed disk due to insufficient ephemeral space + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeManaged)) + Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(128))) // Default size }) + }) - // Ported from VM test: "EncryptionAtHost" - Context("Create - EncryptionAtHost", func() { - It("should create AKS machine 
with EncryptionAtHost enabled when specified in AKSNodeClass", func() { - if nodeClass.Spec.Security == nil { - nodeClass.Spec.Security = &v1beta1.Security{} - } - nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(true) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.Security).ToNot(BeNil()) - Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeTrue()) + Context("Create - Additional Configurations", func() { + It("should handle configured NodeClass", func() { + // Configure comprehensive NodeClass settings + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + CPUManagerPolicy: lo.ToPtr("static"), + CPUCFSQuota: lo.ToPtr(true), + ImageGCHighThresholdPercent: lo.ToPtr(int32(85)), + ImageGCLowThresholdPercent: lo.ToPtr(int32(80)), + FailSwapOn: lo.ToPtr(false), + } + nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.Ubuntu2204ImageFamily) + + // Override context to use a BYO VNet instead of managed VNet + // This allows testing custom subnet configuration (managed VNet doesn't allow custom subnets) + byoClusterSubnetID := "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/byo-vnet-customname/subnets/cluster-subnet" + byoOpts := test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), 
+ UseSIG: lo.ToPtr(true), + SubnetID: lo.ToPtr(byoClusterSubnetID), + }) + byoCtx := options.ToContext(ctx, byoOpts) + + // Extract cluster subnet components and create a test subnet in the same VNet + clusterSubnetComponents, err := utils.GetVnetSubnetIDComponents(byoClusterSubnetID) + Expect(err).ToNot(HaveOccurred()) + testSubnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/nodeclass-subnet", + clusterSubnetComponents.SubscriptionID, clusterSubnetComponents.ResourceGroupName, clusterSubnetComponents.VNetName) + nodeClass.Spec.VNETSubnetID = lo.ToPtr(testSubnetID) + nodeClass.Spec.Tags = map[string]string{ + "custom-tag": "custom-value", + "environment": "test", + "team": "platform", + } + nodeClass.Spec.OSDiskSizeGB = lo.ToPtr(int32(100)) + + // Configure GPU workload to test GPU node selection + pod := coretest.UnschedulablePod(coretest.PodOptions{ + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + Limits: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + }, }) - It("should create AKS machine with EncryptionAtHost disabled when specified in AKSNodeClass", func() { - if nodeClass.Spec.Security == nil { - nodeClass.Spec.Security = &v1beta1.Security{} - } - nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(false) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + ExpectApplied(byoCtx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(byoCtx, env.Client, statusController, nodeClass) + ExpectProvisionedAndWaitForPromises(byoCtx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + 
ExpectScheduled(byoCtx, env.Client, pod) + + // Verify AKS machine was created + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := input.AKSMachine + + // Verify kubelet configuration + Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUManagerPolicy).To(Equal("static")) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUCfsQuota).To(Equal(true)) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcHighThreshold).To(Equal(int32(85))) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcLowThreshold).To(Equal(int32(80))) + Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) + + // Verify image family configuration + Expect(string(*aksMachine.Properties.OperatingSystem.OSSKU)).To(Equal(v1beta1.Ubuntu2204ImageFamily)) + + // Verify subnet configuration (AKS machine should use the specified custom subnet) + Expect(aksMachine.Properties.Network).ToNot(BeNil()) + Expect(aksMachine.Properties.Network.VnetSubnetID).ToNot(BeNil()) + Expect(*aksMachine.Properties.Network.VnetSubnetID).To(Equal(testSubnetID)) + + // Verify custom tags from NodeClass + Expect(aksMachine.Properties.Tags).To(HaveKey("custom-tag")) + Expect(*aksMachine.Properties.Tags["custom-tag"]).To(Equal("custom-value")) + Expect(aksMachine.Properties.Tags).To(HaveKey("environment")) + Expect(*aksMachine.Properties.Tags["environment"]).To(Equal("test")) + Expect(aksMachine.Properties.Tags).To(HaveKey("team")) + Expect(*aksMachine.Properties.Tags["team"]).To(Equal("platform")) + + // Verify Karpenter-managed tags are still present and correct + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) + Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) + 
Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) + Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) + Expect(aksMachine.Properties.Tags).To(HaveKey("compute.aks.billing")) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) + + // Verify OS disk size configuration + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(100))) + + // Verify GPU node was selected (machine should be GPU-capable) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + vmSize := *aksMachine.Properties.Hardware.VMSize + Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) + + // Verify image selection - NodeImageVersion should be set correctly + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + Expect(*aksMachine.Properties.NodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) + }) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + It("should handle configured NodeClaim", func() { + nodeClaim.Spec.Taints = []v1.Taint{ + {Key: "test-taint", Value: "test-value", Effect: v1.TaintEffectNoSchedule}, + } + nodeClaim.Spec.StartupTaints = []v1.Taint{ + {Key: "startup-taint", Value: "startup-value", Effect: v1.TaintEffectNoExecute}, + } + + ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) + _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + + // Verify machine was created with correct taints + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + machine := input.AKSMachine + + // Check that taints are configured + // Currently, we will use "nodeInitializationTaints" field for all taints. More details in the relevant code (aksmachineinstancehelpers.go). + Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("test-taint=test-value:NoSchedule"))) + Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("startup-taint=startup-value:NoExecute"))) + }) - Expect(aksMachine.Properties.Security).ToNot(BeNil()) - Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) - }) + It("should not allow the user to override Karpenter-managed tags", func() { + nodeClass.Spec.Tags = map[string]string{ + "karpenter.azure.com/cluster": "my-override-cluster", + "karpenter.sh/nodepool": "my-override-nodepool", + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + // Verify AKS machine was created with correct Karpenter-managed tags (not user overrides) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := input.AKSMachine + + // Check that AKS machine has correct Karpenter-managed tags + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) + Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) + 
Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) + Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) + + // Verify user-specified tags are ignored for Karpenter-managed keys + Expect(*aksMachine.Properties.Tags["karpenter.sh_nodepool"]).ToNot(Equal("my-override-nodepool")) + Expect(*aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).ToNot(Equal("my-override-cluster")) + }) + }) - It("should create AKS machine with EncryptionAtHost disabled when not specified in AKSNodeClass", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + // Ported from VM test: "EncryptionAtHost" + Context("Create - EncryptionAtHost", func() { + It("should create AKS machine with EncryptionAtHost enabled when specified in AKSNodeClass", func() { + if nodeClass.Spec.Security == nil { + nodeClass.Spec.Security = &v1beta1.Security{} + } + nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(true) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := 
azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - // Security profile should still exist but EncryptionAtHost should be false (default) - Expect(aksMachine.Properties.Security).ToNot(BeNil()) - Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) - }) + Expect(aksMachine.Properties.Security).ToNot(BeNil()) + Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeTrue()) }) - // Labels in the kubernetes.io/k8s.io domains were previously restricted by Karpenter core (<1.9.x) - // and are now allowed on NodeClaims. However, kubelet cannot set most of them, so they should be - // filtered out of AKS Machine NodeLabels (same as the VM path). Karpenter syncs them to the Node - // directly, so they still appear on the Node object. 
- DescribeTable("should handle previously reserved labels on AKS Machine create", - func(label string, expectedInNodeLabels bool) { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, - karpv1.NodeSelectorRequirementWithMinValues{Key: label, Operator: v1.NodeSelectorOpIn, Values: []string{"custom-value"}}, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{label: "custom-value"}}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) + It("should create AKS machine with EncryptionAtHost disabled when specified in AKSNodeClass", func() { + if nodeClass.Spec.Security == nil { + nodeClass.Spec.Security = &v1beta1.Security{} + } + nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(false) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - // Label should always be on the Node (synced by Karpenter) - Expect(node.Labels).To(HaveKeyWithValue(label, "custom-value")) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - if expectedInNodeLabels { - 
Expect(aksMachine.Properties.Kubernetes.NodeLabels).To(HaveKeyWithValue(label, lo.ToPtr("custom-value"))) - } else { - Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(label)) - } - }, - Entry("kubernetes.io (previously reserved)", "kubernetes.io/custom-label", false), - Entry("k8s.io (previously reserved)", "k8s.io/custom-label", false), - Entry("kubelet.kubernetes.io (kubelet-allowed)", "kubelet.kubernetes.io/custom-label", true), - ) + Expect(aksMachine.Properties.Security).ToNot(BeNil()) + Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) + }) - Context("Create - LinuxOSConfig", func() { - It("should create AKS machine with full LinuxOSConfig when specified in AKSNodeClass", func() { - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - FailSwapOn: lo.ToPtr(false), - } - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - SwapFileSize: lo.ToPtr("1500Mi"), - TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragMadvise), - TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledAlways), - Sysctls: &v1beta1.SysctlConfiguration{ - FsAioMaxNr: lo.ToPtr(int32(65536)), - FsFileMax: lo.ToPtr(int32(12000)), - FsInotifyMaxUserWatches: lo.ToPtr(int32(781250)), - FsNrOpen: lo.ToPtr(int32(8192)), - KernelThreadsMax: lo.ToPtr(int32(30000)), - NetCoreNetdevMaxBacklog: lo.ToPtr(int32(1000)), - NetCoreOptmemMax: lo.ToPtr(int32(20480)), - NetCoreRmemDefault: lo.ToPtr(int32(212992)), - NetCoreRmemMax: lo.ToPtr(int32(212992)), - NetCoreSomaxconn: lo.ToPtr(int32(4096)), - NetCoreWmemDefault: lo.ToPtr(int32(212992)), - NetCoreWmemMax: lo.ToPtr(int32(212992)), - NetIPv4IPLocalPortRange: lo.ToPtr("32768 60999"), - NetIPv4NeighDefaultGcThresh1: lo.ToPtr(int32(128)), - NetIPv4NeighDefaultGcThresh2: lo.ToPtr(int32(512)), - NetIPv4NeighDefaultGcThresh3: lo.ToPtr(int32(1024)), - NetIPv4TCPFinTimeout: 
lo.ToPtr(int32(60)), - NetIPv4TCPKeepaliveProbes: lo.ToPtr(int32(9)), - NetIPv4TCPKeepaliveTime: lo.ToPtr(int32(7200)), - NetIPv4TCPMaxSynBacklog: lo.ToPtr(int32(128)), - NetIPv4TCPMaxTwBuckets: lo.ToPtr(int32(8000)), - NetIPv4TCPTwReuse: lo.ToPtr(true), - NetIPv4TCPKeepaliveIntvl: lo.ToPtr(int32(75)), - NetNetfilterNfConntrackBuckets: lo.ToPtr(int32(65536)), - NetNetfilterNfConntrackMax: lo.ToPtr(int32(131072)), - VMMaxMapCount: lo.ToPtr(int32(65530)), - VMSwappiness: lo.ToPtr(int32(60)), - VMVfsCachePressure: lo.ToPtr(int32(100)), - }, - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should create AKS machine with EncryptionAtHost disabled when not specified in AKSNodeClass", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := 
aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) - - // Verify top-level fields - Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(1500))) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("madvise")) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("always")) - - // Verify failSwapOn was wired through to kubelet config - Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) - - // Verify sysctl fields - Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsAioMaxNr)).To(Equal(int32(65536))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsFileMax)).To(Equal(int32(12000))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsInotifyMaxUserWatches)).To(Equal(int32(781250))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsNrOpen)).To(Equal(int32(8192))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.KernelThreadsMax)).To(Equal(int32(30000))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreNetdevMaxBacklog)).To(Equal(int32(1000))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreOptmemMax)).To(Equal(int32(20480))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemDefault)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemMax)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreSomaxconn)).To(Equal(int32(4096))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemDefault)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemMax)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4IPLocalPortRange)).To(Equal("32768 60999")) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh1)).To(Equal(int32(128))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh2)).To(Equal(int32(512))) - 
Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh3)).To(Equal(int32(1024))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPFinTimeout)).To(Equal(int32(60))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveProbes)).To(Equal(int32(9))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveTime)).To(Equal(int32(7200))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxSynBacklog)).To(Equal(int32(128))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxTwBuckets)).To(Equal(int32(8000))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPTwReuse)).To(BeTrue()) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TcpkeepaliveIntvl)).To(Equal(int32(75))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackBuckets)).To(Equal(int32(65536))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackMax)).To(Equal(int32(131072))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(65530))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(60))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMVfsCachePressure)).To(Equal(int32(100))) - }) + // Security profile should still exist but EncryptionAtHost should be false (default) + Expect(aksMachine.Properties.Security).ToNot(BeNil()) + Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) + }) + }) - It("should create AKS machine with only sysctls when only sysctls are specified", func() { - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - Sysctls: &v1beta1.SysctlConfiguration{ - VMMaxMapCount: lo.ToPtr(int32(262144)), - VMSwappiness: lo.ToPtr(int32(10)), - }, - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + // Labels in the kubernetes.io/k8s.io domains were previously restricted by Karpenter core (<1.9.x) + // and are 
now allowed on NodeClaims. However, kubelet cannot set most of them, so they should be + // filtered out of AKS Machine NodeLabels (same as the VM path). Karpenter syncs them to the Node + // directly, so they still appear on the Node object. + DescribeTable("should handle previously reserved labels on AKS Machine create", + func(label string, expectedInNodeLabels bool) { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, + karpv1.NodeSelectorRequirementWithMinValues{Key: label, Operator: v1.NodeSelectorOpIn, Values: []string{"custom-value"}}, + ) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{label: "custom-value"}}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + + // Label should always be on the Node (synced by Karpenter) + Expect(node.Labels).To(HaveKeyWithValue(label, "custom-value")) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + + if expectedInNodeLabels { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).To(HaveKeyWithValue(label, lo.ToPtr("custom-value"))) + } else { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(label)) + } + }, + Entry("kubernetes.io (previously reserved)", "kubernetes.io/custom-label", false), + Entry("k8s.io (previously reserved)", "k8s.io/custom-label", false), + Entry("kubelet.kubernetes.io (kubelet-allowed)", "kubelet.kubernetes.io/custom-label", true), + ) + + Context("Create - LinuxOSConfig", func() { + It("should create AKS machine with full LinuxOSConfig when specified in AKSNodeClass", func() { + 
nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + FailSwapOn: lo.ToPtr(false), + } + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + SwapFileSize: lo.ToPtr("1500Mi"), + TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragMadvise), + TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledAlways), + Sysctls: &v1beta1.SysctlConfiguration{ + FsAioMaxNr: lo.ToPtr(int32(65536)), + FsFileMax: lo.ToPtr(int32(12000)), + FsInotifyMaxUserWatches: lo.ToPtr(int32(781250)), + FsNrOpen: lo.ToPtr(int32(8192)), + KernelThreadsMax: lo.ToPtr(int32(30000)), + NetCoreNetdevMaxBacklog: lo.ToPtr(int32(1000)), + NetCoreOptmemMax: lo.ToPtr(int32(20480)), + NetCoreRmemDefault: lo.ToPtr(int32(212992)), + NetCoreRmemMax: lo.ToPtr(int32(212992)), + NetCoreSomaxconn: lo.ToPtr(int32(4096)), + NetCoreWmemDefault: lo.ToPtr(int32(212992)), + NetCoreWmemMax: lo.ToPtr(int32(212992)), + NetIPv4IPLocalPortRange: lo.ToPtr("32768 60999"), + NetIPv4NeighDefaultGcThresh1: lo.ToPtr(int32(128)), + NetIPv4NeighDefaultGcThresh2: lo.ToPtr(int32(512)), + NetIPv4NeighDefaultGcThresh3: lo.ToPtr(int32(1024)), + NetIPv4TCPFinTimeout: lo.ToPtr(int32(60)), + NetIPv4TCPKeepaliveProbes: lo.ToPtr(int32(9)), + NetIPv4TCPKeepaliveTime: lo.ToPtr(int32(7200)), + NetIPv4TCPMaxSynBacklog: lo.ToPtr(int32(128)), + NetIPv4TCPMaxTwBuckets: lo.ToPtr(int32(8000)), + NetIPv4TCPTwReuse: lo.ToPtr(true), + NetIPv4TCPKeepaliveIntvl: lo.ToPtr(int32(75)), + NetNetfilterNfConntrackBuckets: lo.ToPtr(int32(65536)), + NetNetfilterNfConntrackMax: lo.ToPtr(int32(131072)), + VMMaxMapCount: lo.ToPtr(int32(65530)), + VMSwappiness: lo.ToPtr(int32(60)), + VMVfsCachePressure: lo.ToPtr(int32(100)), + }, + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) 
- ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) + + // Verify top-level fields + Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(1500))) + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("madvise")) + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("always")) + + // Verify failSwapOn was wired through to kubelet config + Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) + + // Verify sysctl fields + Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsAioMaxNr)).To(Equal(int32(65536))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsFileMax)).To(Equal(int32(12000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsInotifyMaxUserWatches)).To(Equal(int32(781250))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsNrOpen)).To(Equal(int32(8192))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.KernelThreadsMax)).To(Equal(int32(30000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreNetdevMaxBacklog)).To(Equal(int32(1000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreOptmemMax)).To(Equal(int32(20480))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemDefault)).To(Equal(int32(212992))) 
+ Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemMax)).To(Equal(int32(212992))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreSomaxconn)).To(Equal(int32(4096))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemDefault)).To(Equal(int32(212992))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemMax)).To(Equal(int32(212992))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4IPLocalPortRange)).To(Equal("32768 60999")) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh1)).To(Equal(int32(128))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh2)).To(Equal(int32(512))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh3)).To(Equal(int32(1024))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPFinTimeout)).To(Equal(int32(60))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveProbes)).To(Equal(int32(9))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveTime)).To(Equal(int32(7200))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxSynBacklog)).To(Equal(int32(128))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxTwBuckets)).To(Equal(int32(8000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPTwReuse)).To(BeTrue()) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TcpkeepaliveIntvl)).To(Equal(int32(75))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackBuckets)).To(Equal(int32(65536))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackMax)).To(Equal(int32(131072))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(65530))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(60))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMVfsCachePressure)).To(Equal(int32(100))) + }) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := 
createInput.AKSMachine + It("should create AKS machine with only sysctls when only sysctls are specified", func() { + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + Sysctls: &v1beta1.SysctlConfiguration{ + VMMaxMapCount: lo.ToPtr(int32(262144)), + VMSwappiness: lo.ToPtr(int32(10)), + }, + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - // Top-level fields should be nil - Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) - Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) - Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - // Sysctls should be set - Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(262144))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(10))) + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) - // Other sysctls should be nil - Expect(linuxOSConfig.Sysctls.FsAioMaxNr).To(BeNil()) - }) + // Top-level fields should be nil + Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) + Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) + 
Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) - It("should create AKS machine with only TransparentHugePage settings when only TransparentHugePage is specified", func() { - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledNever), - TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragDefer), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + // Sysctls should be set + Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(262144))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(10))) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + // Other sysctls should be nil + Expect(linuxOSConfig.Sysctls.FsAioMaxNr).To(BeNil()) + }) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + It("should create AKS machine with only TransparentHugePage settings when only TransparentHugePage is specified", func() { + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledNever), + TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragDefer), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) + pod := 
coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("never")) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("defer")) - Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) - Expect(linuxOSConfig.Sysctls).To(BeNil()) - }) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - It("should create AKS machine with only SwapFileSize when only swap is specified", func() { - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - FailSwapOn: lo.ToPtr(false), - } - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - SwapFileSize: lo.ToPtr("2Gi"), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("never")) + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("defer")) + Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) + Expect(linuxOSConfig.Sysctls).To(BeNil()) + }) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := 
createInput.AKSMachine + It("should create AKS machine with only SwapFileSize when only swap is specified", func() { + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + FailSwapOn: lo.ToPtr(false), + } + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + SwapFileSize: lo.ToPtr("2Gi"), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) - Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(2048))) - Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) - Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) - Expect(linuxOSConfig.Sysctls).To(BeNil()) - }) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) + Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(2048))) + Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) + Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) + Expect(linuxOSConfig.Sysctls).To(BeNil()) + }) - It("should create AKS machine without LinuxProfile when LinuxOSConfig is not specified", func() { - // Explicitly ensure LinuxOSConfig is not set - nodeClass.Spec.LinuxOSConfig = nil - ExpectApplied(ctx, env.Client, 
nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should create AKS machine without LinuxProfile when LinuxOSConfig is not specified", func() { + // Explicitly ensure LinuxOSConfig is not set + nodeClass.Spec.LinuxOSConfig = nil + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).To(BeNil()) - }) + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).To(BeNil()) }) + }) - Context("Create - ArtifactStreaming", func() { - It("should set ArtifactStreamingProfile when explicitly enabled", func() { - nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ - Enabled: lo.ToPtr(true), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + Context("Create - ArtifactStreaming", func() { + It("should set 
ArtifactStreamingProfile when explicitly enabled", func() { + nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ + Enabled: lo.ToPtr(true), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) - Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile.Enabled)).To(BeTrue()) - }) + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile.Enabled)).To(BeTrue()) + }) - It("should not set ArtifactStreamingProfile when not specified (defaults to disabled)", func() { - nodeClass.Spec.ArtifactStreaming = nil - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should not set ArtifactStreamingProfile when not specified 
(defaults to disabled)", func() { + nodeClass.Spec.ArtifactStreaming = nil + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) - Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) - }) + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) + }) - It("should not set ArtifactStreamingProfile when explicitly disabled", func() { - nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ - Enabled: lo.ToPtr(false), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should not set ArtifactStreamingProfile when explicitly disabled", func() { + nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ + Enabled: lo.ToPtr(false), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := 
coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) - Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) - }) + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) + }) - It("should not set ArtifactStreamingProfile for ARM64 instance types even when enabled", func() { - nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ - Enabled: lo.ToPtr(true), - } - // ARM64 does not support artifact streaming; IsArtifactStreamingEnabled returns false for arm64. - // Verify through the NodeClass API directly since the test environment may not have ARM64 instance types. 
- Expect(nodeClass.IsArtifactStreamingEnabled("arm64")).To(BeFalse()) - Expect(nodeClass.IsArtifactStreamingEnabled("amd64")).To(BeTrue()) - }) + It("should not set ArtifactStreamingProfile for ARM64 instance types even when enabled", func() { + nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ + Enabled: lo.ToPtr(true), + } + // ARM64 does not support artifact streaming; IsArtifactStreamingEnabled returns false for arm64. + // Verify through the NodeClass API directly since the test environment may not have ARM64 instance types. + Expect(nodeClass.IsArtifactStreamingEnabled("arm64")).To(BeFalse()) + Expect(nodeClass.IsArtifactStreamingEnabled("amd64")).To(BeTrue()) }) + }) - Context("Create - LocalDNS", func() { - It("should set LocalDNSProfile with mode Required", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeRequired, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + Context("Create - LocalDNS", func() { + It("should set LocalDNSProfile with mode Required", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeRequired, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + 
ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) - Expect(aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides).To(HaveLen(2)) - Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) - }) + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) + Expect(aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides).To(HaveLen(2)) + Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) + }) - It("should not set LocalDNSProfile when LocalDNS is nil", func() { - nodeClass.Spec.LocalDNS = nil - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should not set LocalDNSProfile when LocalDNS is nil", func() { + nodeClass.Spec.LocalDNS = nil + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := 
coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.LocalDNSProfile).To(BeNil()) - }) + Expect(aksMachine.Properties.LocalDNSProfile).To(BeNil()) + }) - It("should correctly convert override fields including durations", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeRequired, - VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, - QueryLogging: v1beta1.LocalDNSQueryLoggingLog, - Protocol: v1beta1.LocalDNSProtocolForceTCP, - ForwardPolicy: v1beta1.LocalDNSForwardPolicyRoundRobin, - MaxConcurrent: lo.ToPtr(int32(50)), - CacheDuration: karpv1.MustParseNillableDuration("30s"), - ServeStaleDuration: karpv1.MustParseNillableDuration("60s"), - ServeStale: v1beta1.LocalDNSServeStaleImmediate, - }, - { - Zone: "cluster.local", - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - QueryLogging: v1beta1.LocalDNSQueryLoggingLog, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(10)), - CacheDuration: karpv1.MustParseNillableDuration("10s"), - ServeStaleDuration: karpv1.MustParseNillableDuration("5s"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, + It("should correctly convert override fields including 
durations", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeRequired, + VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, + QueryLogging: v1beta1.LocalDNSQueryLoggingLog, + Protocol: v1beta1.LocalDNSProtocolForceTCP, + ForwardPolicy: v1beta1.LocalDNSForwardPolicyRoundRobin, + MaxConcurrent: lo.ToPtr(int32(50)), + CacheDuration: karpv1.MustParseNillableDuration("30s"), + ServeStaleDuration: karpv1.MustParseNillableDuration("60s"), + ServeStale: v1beta1.LocalDNSServeStaleImmediate, }, - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + { + Zone: "cluster.local", + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + QueryLogging: v1beta1.LocalDNSQueryLoggingLog, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(10)), + CacheDuration: karpv1.MustParseNillableDuration("10s"), + ServeStaleDuration: karpv1.MustParseNillableDuration("5s"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, + }, + }, + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - 
Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - - vnetOverride := aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides["."] - Expect(vnetOverride).ToNot(BeNil()) - Expect(lo.FromPtr(vnetOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationVnetDNS)) - Expect(lo.FromPtr(vnetOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) - Expect(lo.FromPtr(vnetOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolForceTCP)) - Expect(lo.FromPtr(vnetOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicyRoundRobin)) - Expect(lo.FromPtr(vnetOverride.MaxConcurrent)).To(Equal(int32(50))) - Expect(lo.FromPtr(vnetOverride.CacheDurationInSeconds)).To(Equal(int32(30))) - Expect(lo.FromPtr(vnetOverride.ServeStaleDurationInSeconds)).To(Equal(int32(60))) - Expect(lo.FromPtr(vnetOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleImmediate)) - }) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + + vnetOverride := aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides["."] + Expect(vnetOverride).ToNot(BeNil()) + Expect(lo.FromPtr(vnetOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationVnetDNS)) + Expect(lo.FromPtr(vnetOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) + Expect(lo.FromPtr(vnetOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolForceTCP)) + 
Expect(lo.FromPtr(vnetOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicyRoundRobin)) + Expect(lo.FromPtr(vnetOverride.MaxConcurrent)).To(Equal(int32(50))) + Expect(lo.FromPtr(vnetOverride.CacheDurationInSeconds)).To(Equal(int32(30))) + Expect(lo.FromPtr(vnetOverride.ServeStaleDurationInSeconds)).To(Equal(int32(60))) + Expect(lo.FromPtr(vnetOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleImmediate)) + }) - It("should set LocalDNSProfile with mode Disabled", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeDisabled, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should set LocalDNSProfile with mode Disabled", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeDisabled, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - 
aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) - }) + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) + }) - It("should rewrite Preferred to Required on the wire when Status.LocalDNSState=Enabled", func() { - // Preferred is never sent downstream — Karpenter is the only kube-aware - // resolver, so ResolvedLocalDNSForWire rewrites Mode to the terminal - // value implied by Status.LocalDNSState. Enabled => Required. - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModePreferred, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - nodeClass.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should rewrite Preferred to Required on the wire when Status.LocalDNSState=Enabled", func() { + // Preferred is never sent downstream — Karpenter is the only kube-aware + // resolver, so ResolvedLocalDNSForWire rewrites Mode to the terminal + // value implied by Status.LocalDNSState. Enabled => Required. 
+ nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModePreferred, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + nodeClass.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) - }) + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) + }) - It("should rewrite Preferred to Disabled on the wire when Status.LocalDNSState is unset", func() { - // Defense-in-depth: if Status hasn't been resolved yet, never pass - // Preferred downstream — the downstream resolver cannot see cluster - // gates and 
would re-decide incorrectly. Fall back to Disabled. - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModePreferred, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - // The status sub-reconciler resolves Preferred to Enabled in this - // test env (no cluster conflicts). Wipe LocalDNSState back to nil - // via a status Patch to drive the "Status not yet resolved" - // branch of ResolvedLocalDNSForWire. Re-fetch first because the - // reconcile bumped the resource version. - Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) - stored := nodeClass.DeepCopy() - nodeClass.Status.LocalDNSState = nil - Expect(env.Client.Status().Patch(ctx, nodeClass, client.MergeFrom(stored))).To(Succeed()) + It("should rewrite Preferred to Disabled on the wire when Status.LocalDNSState is unset", func() { + // Defense-in-depth: if Status hasn't been resolved yet, never pass + // Preferred downstream — the downstream resolver cannot see cluster + // gates and would re-decide incorrectly. Fall back to Disabled. + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModePreferred, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + // The status sub-reconciler resolves Preferred to Enabled in this + // test env (no cluster conflicts). Wipe LocalDNSState back to nil + // via a status Patch to drive the "Status not yet resolved" + // branch of ResolvedLocalDNSForWire. 
Re-fetch first because the + // reconcile bumped the resource version. + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + stored := nodeClass.DeepCopy() + nodeClass.Status.LocalDNSState = nil + Expect(env.Client.Status().Patch(ctx, nodeClass, client.MergeFrom(stored))).To(Succeed()) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) + }) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + It("should correctly convert KubeDNSOverrides field values", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeRequired, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + QueryLogging: v1beta1.LocalDNSQueryLoggingLog, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(25)), + CacheDuration: karpv1.MustParseNillableDuration("15s"), + ServeStaleDuration: karpv1.MustParseNillableDuration("45s"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, + }, + validLocalDNSZoneOverride("cluster.local", 
v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + }, + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) + + kubeOverride := aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides["."] + Expect(kubeOverride).ToNot(BeNil()) + Expect(lo.FromPtr(kubeOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationClusterCoreDNS)) + Expect(lo.FromPtr(kubeOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) + Expect(lo.FromPtr(kubeOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolPreferUDP)) + Expect(lo.FromPtr(kubeOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicySequential)) + Expect(lo.FromPtr(kubeOverride.MaxConcurrent)).To(Equal(int32(25))) + Expect(lo.FromPtr(kubeOverride.CacheDurationInSeconds)).To(Equal(int32(15))) + Expect(lo.FromPtr(kubeOverride.ServeStaleDurationInSeconds)).To(Equal(int32(45))) + Expect(lo.FromPtr(kubeOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleVerify)) + }) + }) +} - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - 
Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) +var _ = Describe("CloudProvider", func() { + Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), + UseSIG: lo.ToPtr(true), }) - It("should correctly convert KubeDNSOverrides field values", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeRequired, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - QueryLogging: v1beta1.LocalDNSQueryLoggingLog, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(25)), - CacheDuration: karpv1.MustParseNillableDuration("15s"), - ServeStaleDuration: karpv1.MustParseNillableDuration("45s"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - validLocalDNSZoneOverride("cluster.local", v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - }, - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + azureEnv = test.NewEnvironment(ctx, env) + azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, 
azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) - - kubeOverride := aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides["."] - Expect(kubeOverride).ToNot(BeNil()) - Expect(lo.FromPtr(kubeOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationClusterCoreDNS)) - Expect(lo.FromPtr(kubeOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) - Expect(lo.FromPtr(kubeOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolPreferUDP)) - 
Expect(lo.FromPtr(kubeOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicySequential)) - Expect(lo.FromPtr(kubeOverride.MaxConcurrent)).To(Equal(int32(25))) - Expect(lo.FromPtr(kubeOverride.CacheDurationInSeconds)).To(Equal(int32(15))) - Expect(lo.FromPtr(kubeOverride.ServeStaleDurationInSeconds)).To(Equal(int32(45))) - Expect(lo.FromPtr(kubeOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleVerify)) - }) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + }) + + AfterEach(func() { + // Wait for any async polling goroutines to complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + azureEnvNonZonal.Reset(ctx) }) + runAKSMachineAPIFeatureTests() }) }) diff --git a/pkg/cloudprovider/suite_integration_test.go b/pkg/cloudprovider/suite_integration_test.go index cd3db17b8..a04bedf42 100644 --- a/pkg/cloudprovider/suite_integration_test.go +++ b/pkg/cloudprovider/suite_integration_test.go @@ -57,100 +57,66 @@ func validateAKSMachineNodeClaim(nodeClaim *karpv1.NodeClaim, nodePool *karpv1.N Expect(nodeClaim.Annotations[v1beta1.AnnotationAKSMachineResourceID]).ToNot(BeEmpty()) } -// runSharedAKSMachineAPITests contains the common test cases that should be run -// for both ManageExistingAKSMachines = true and false configurations -func runSharedAKSMachineAPITests() { +func runSharedProvisionModeIntegrationTests(provisionMode provisionModeTestCase) { It("should be able to handle basic operations", func() { ExpectApplied(ctx, env.Client, nodeClass, nodePool) - // List should return nothing - azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Reset() - azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Reset() + provisionMode.resetListCalls() nodeClaims, err := cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) Expect(nodeClaims).To(BeEmpty()) - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) // Expect to be called in case of existing VMs - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolGetBehavior.CalledWithInput.Len()).To(Equal(0)) // No unnecessary checks + provisionMode.expectListCalls() - // Scale-up 1 node - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + provisionMode.resetCreateCalls() pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - //// Should call AKS Machine APIs instead of VM APIs - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(createInput.AKSMachine.Properties).ToNot(BeNil()) + provisionMode.expectCreateCalls() + provisionMode.expectCreatedResource() - // List should return the created nodeclaim - azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Reset() - azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Reset() + provisionMode.resetListCalls() nodeClaims, err = cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) // Expect to be called in case of existing VMs + provisionMode.expectListCalls() - //// The returned nodeClaim should be correct 
Expect(nodeClaims).To(HaveLen(1)) createdNodeClaim := nodeClaims[0] - validateAKSMachineNodeClaim(createdNodeClaim, nodePool) + provisionMode.validateNodeClaim(createdNodeClaim) - // Get should return the created nodeClaim - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() + provisionMode.resetGetCalls() retrievedNodeClaim, err := cloudProvider.Get(ctx, createdNodeClaim.Status.ProviderID) Expect(err).ToNot(HaveOccurred()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) // Should not be bothered + provisionMode.expectGetCalls() - //// The returned nodeClaim should be correct - validateAKSMachineNodeClaim(retrievedNodeClaim, nodePool) + provisionMode.validateNodeClaim(retrievedNodeClaim) - // Delete - azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Reset() + provisionMode.resetDeleteCalls() Expect(cloudProvider.Delete(ctx, retrievedNodeClaim)).To(Succeed()) - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(0)) // Should not be bothered + provisionMode.expectDeleteCalls() - //// List should return no nodeclaims - azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Reset() - azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Reset() + provisionMode.resetListCalls() nodeClaims, err = cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) - 
Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) // Expect to be called + provisionMode.expectListCalls() Expect(nodeClaims).To(BeEmpty()) - //// Get should return NodeClaimNotFound error - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() + provisionMode.resetGetCalls() nodeClaim, err = cloudProvider.Get(ctx, createdNodeClaim.Status.ProviderID) Expect(err).To(HaveOccurred()) Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) // Should not be bothered + provisionMode.expectGetCalls() Expect(nodeClaim).To(BeNil()) }) runNodeOverlayCapacityTests(nodeOverlayCapacityTestOptions{ - validateNodeClaim: func(nodeClaim *karpv1.NodeClaim) { - validateAKSMachineNodeClaim(nodeClaim, nodePool) - }, - resetCreateCalls: func() { - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() - }, - expectCreateCalls: func() { - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - }, + validateNodeClaim: provisionMode.validateNodeClaim, + resetCreateCalls: provisionMode.resetCreateCalls, + expectCreateCalls: provisionMode.expectCreateCalls, }) +} +func runAKSMachineAPIIntegrationTests() { // XPMT: TODO(comtalyst): deep inspection test on simulating all of these? 
Context("Unexpected API Failures", func() { It("should handle AKS machine create failures - unrecognized error during sync/initial", func() { @@ -627,8 +593,9 @@ var _ = Describe("CloudProvider", func() { azureEnvNonZonal.Reset(ctx) }) - // Run shared AKS Machine API tests - runSharedAKSMachineAPITests() + // Run shared provision-mode tests + runSharedProvisionModeIntegrationTests(aksMachineAPIHeaderBatchProvisionMode()) + runAKSMachineAPIIntegrationTests() }) Context("ProvisionMode = AKSMachineAPIHeaderBatch, ManageExistingAKSMachines = true", func() { @@ -666,8 +633,9 @@ var _ = Describe("CloudProvider", func() { azureEnvNonZonal.Reset(ctx) }) - // Run shared AKS Machine API tests - runSharedAKSMachineAPITests() + // Run shared provision-mode tests + runSharedProvisionModeIntegrationTests(aksMachineAPIHeaderBatchProvisionMode()) + runAKSMachineAPIIntegrationTests() }) Context("Mixed Environment - Migration from ProvisionMode = AKSMachineAPIHeaderBatch to VM mode", func() { diff --git a/pkg/cloudprovider/suite_modes_test.go b/pkg/cloudprovider/suite_modes_test.go new file mode 100644 index 000000000..c0a4fa201 --- /dev/null +++ b/pkg/cloudprovider/suite_modes_test.go @@ -0,0 +1,81 @@ +/* +Portions Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cloudprovider + +import ( + . 
"github.com/onsi/gomega" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" +) + +type provisionModeTestCase struct { + name string + validateNodeClaim func(*karpv1.NodeClaim) + resetCreateCalls func() + expectCreateCalls func() + expectCreatedResource func() + resetListCalls func() + expectListCalls func() + resetGetCalls func() + expectGetCalls func() + resetDeleteCalls func() + expectDeleteCalls func() +} + +func aksMachineAPIHeaderBatchProvisionMode() provisionModeTestCase { + return provisionModeTestCase{ + name: "AKSMachineAPIHeaderBatch", + validateNodeClaim: func(nodeClaim *karpv1.NodeClaim) { + validateAKSMachineNodeClaim(nodeClaim, nodePool) + }, + resetCreateCalls: func() { + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + }, + expectCreateCalls: func() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) + }, + expectCreatedResource: func() { + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(createInput.AKSMachine.Properties).ToNot(BeNil()) + }, + resetListCalls: func() { + azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Reset() + azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Reset() + }, + expectListCalls: func() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) + }, + resetGetCalls: func() { + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() + }, + expectGetCalls: func() { + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) + }, + resetDeleteCalls: func() { + azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Reset() + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Reset() + }, + expectDeleteCalls: func() { + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(0)) + }, + } +} diff --git a/pkg/cloudprovider/suite_offerings_test.go b/pkg/cloudprovider/suite_offerings_test.go index 697e67610..4c58eda2a 100644 --- a/pkg/cloudprovider/suite_offerings_test.go +++ b/pkg/cloudprovider/suite_offerings_test.go @@ -53,877 +53,881 @@ import ( "github.com/Azure/skewer" ) -var _ = Describe("CloudProvider", func() { - Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { - BeforeEach(func() { - testOptions = test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), - UseSIG: lo.ToPtr(true), +func runAKSMachineAPIOfferingTests() { + Context("Create - Expected Creation Failures", func() { + // Ported from VM test: "should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed" + It("should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed", func() { + // Configure NodePool to allow both spot and on-demand + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}, }) - - ctx = coreoptions.ToContext(ctx, coretest.Options()) - ctx = options.ToContext(ctx, testOptions) - - azureEnv = test.NewEnvironment(ctx, env) - 
azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) - statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) - cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) - - cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) - clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) - coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) - - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + // Set up async error + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorLowPriorityCoresQuota(fake.Region) + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + + // Verify spot capacity 
type marked as unavailable due to quota error + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) + Expect(*createInput.AKSMachine.Properties.Priority).To(Equal(armcontainerservice.ScaleSetPrioritySpot)) + testSKU := fake.MakeSKU(vmSize) + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) + + // Clear both error and output for retry - should succeed with on-demand + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) + + // Verify final node count + nodes, err := env.KubernetesInterface.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(len(nodes.Items)).To(Equal(1)) + Expect(nodes.Items[0].Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) }) - AfterEach(func() { - // Wait for any async polling goroutines to complete before resetting - cloudProvider.WaitForInstancePromises() - cluster.Reset() - azureEnv.Reset(ctx) - azureEnvNonZonal.Reset(ctx) + // Ported from VM test: "should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed" + It("should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed", func() { + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}}) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + // 
Set up async error via BOTH Error and Output (LRO returns both) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedZonalAllocation() + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + // Verify the create API was called but failed due to zonal allocation constraint + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + initialZone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + + // Verify initial zone marked as unavailable due to zonal allocation failure + vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) + testSKU := fake.MakeSKU(vmSize) + ExpectUnavailable(azureEnv, testSKU, initialZone, karpv1.CapacityTypeSpot) + + // Clear the error and retry - should succeed with different zone + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(initialZone)) }) - Context("Create - Expected Creation Failures", func() { - // Ported from VM test: "should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed" - It("should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed", func() { - // Configure NodePool to allow both spot and on-demand - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + // Ported from VM test: "should fail to provision when OverconstrainedAllocation 
errors are hit, then switch capacity type and succeed" + It("should fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed", func() { + // Configure NodePool to allow multiple capacity types + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}, - }) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Set up async error - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorLowPriorityCoresQuota(fake.Region) + }, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1beta1.LabelPlacementScope, + Operator: v1.NodeSelectorOpIn, + Values: []string{v1beta1.PlacementScopeZonal}, + }, + ) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + // Set up async error via BOTH Error and Output (LRO returns both) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedAllocation() + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + // Verify the create API was called but failed due to overconstrained allocation + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + + // Verify spot capacity type marked as unavailable due to allocation error + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) + testSKU := fake.MakeSKU(vmSize) + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) + + // Clear 
both error and output for retry - should succeed with on-demand because + // this test constrains the NodePool to zonal placement. + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) + Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) + }) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - - // Verify spot capacity type marked as unavailable due to quota error - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) - Expect(*createInput.AKSMachine.Properties.Priority).To(Equal(armcontainerservice.ScaleSetPrioritySpot)) - testSKU := fake.MakeSKU(vmSize) - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) + // Ported from VM test: "should fail to provision when AllocationFailure errors are hit, then switch placement and succeed" + It("should fail to provision when AllocationFailure errors are hit, then switch placement and succeed", func() { + // Configure NodePool to allow multiple instance types + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v3", "Standard_D64s_v3"}, + }) + ExpectApplied(ctx, env.Client, nodePool, 
nodeClass) + + // Set up async error via BOTH Error and Output (LRO returns both) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + // Verify the create API was called but failed due to allocation failure + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + initialVMSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + + // Verify initial VM size marked as unavailable due to allocation failure + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) + + // Clear the error and retry - should succeed with the same VM size placed regionally + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) + Expect(node.Labels[v1.LabelTopologyZone]).To(Equal(zones.Regional)) + }) - // Clear both error and output for retry - should succeed with on-demand - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) + // Ported from VM test: "should fail to provision when AllocationFailure errors are hit and 
regional placement is unavailable" + It("should fail to provision when AllocationFailure errors are hit and regional placement is unavailable", func() { + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v3"}, + }) + sku := fake.MakeSKU("Standard_D2_v3") + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeOnDemand) + ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeOnDemand) + + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) + }) - // Verify final node 
count - nodes, err := env.KubernetesInterface.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - Expect(err).ToNot(HaveOccurred()) - Expect(len(nodes.Items)).To(Equal(1)) - Expect(nodes.Items[0].Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) + It("should fail to provision when AllocationFailure errors are hit and all placements for the VM size are unavailable, then switch VM size and succeed", func() { + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v3", "Standard_D64s_v3"}, }) + sku := fake.MakeSKU("Standard_D2_v3") + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + initialVMSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeOnDemand) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, 
karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, karpv1.CapacityTypeOnDemand) + + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelInstanceTypeStable]).ToNot(Equal(initialVMSize)) + }) - // Ported from VM test: "should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed" - It("should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed", func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}}) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + // Ported from VM test: "should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone" + It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedZonalAllocation() + // Set up async error via BOTH Error and Output (LRO returns both) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 24, 24, 8, 32) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod() + 
ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) - // Verify the create API was called but failed due to zonal allocation constraint - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - initialZone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) + // Verify the create API was called but failed due to family quota + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - // Verify initial zone marked as unavailable due to zonal allocation failure - vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) - testSKU := fake.MakeSKU(vmSize) - ExpectUnavailable(azureEnv, testSKU, initialZone, karpv1.CapacityTypeSpot) + // Clear the error and retry - should succeed + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + }) - // Clear the error and retry - should succeed with different zone - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(initialZone)) - }) + // Ported from VM test: "should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone" + It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Ported from VM test: "should 
fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed" - It("should fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed", func() { - // Configure NodePool to allow multiple capacity types - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}, - }, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1beta1.LabelPlacementScope, - Operator: v1.NodeSelectorOpIn, - Values: []string{v1beta1.PlacementScopeZonal}, - }, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + // Set up async error via BOTH Error and Output (LRO returns both) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 0, 0, 8, 8) - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedAllocation() + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) + // Verify the create API was called but failed due to zero quota limit + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - // Verify the create API was called but failed due to overconstrained allocation - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + // Clear the error and retry - should succeed 
+ azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + }) - // Verify spot capacity type marked as unavailable due to allocation error - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) - testSKU := fake.MakeSKU(vmSize) - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) + // Ported from VM test: Total Regional Cores quota test pattern + It("should return ICE if Total Regional Cores Quota errors are hit", func() { + // Set up async error via BOTH Error and Output (LRO returns both) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorTotalRegionalCoresQuota(fake.Region) - // Clear both error and output for retry - should succeed with on-demand because - // this test constrains the NodePool to zonal placement. 
- azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) - Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) + // Create nodeClaim directly and call cloudProvider.Create like VM tests + testNodeClaim1 := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + karpv1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + }, + }, }) - // Ported from VM test: "should fail to provision when AllocationFailure errors are hit, then switch placement and succeed" - It("should fail to provision when AllocationFailure errors are hit, then switch placement and succeed", func() { - // Configure NodePool to allow multiple instance types - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v3", "Standard_D64s_v3"}, - }) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim1) + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim1) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(claim).To(BeNil()) + }) + }) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, 
coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) + // Ported from VM test: "Zone-aware provisioning" + Context("Create - Zone-aware provisioning", func() { + // Ported from VM test: "should launch in the NodePool-requested zone" + It("should launch in the NodePool-requested zone", func() { + zone, aksMachineZone := fmt.Sprintf("%s-3", fake.Region), "3" + nodePool.Spec.Template.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ + {Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeSpot, karpv1.CapacityTypeOnDemand}}, + {Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{zone}}, + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zone)) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine).NotTo(BeNil()) + Expect(aksMachine.Zones).To(ConsistOf(&aksMachineZone)) + }) - // Verify the create API was called but failed due to allocation failure - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - initialVMSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + // Ported from VM test: "should support provisioning in non-zonal regions" + It("should support provisioning in non-zonal regions", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, 
clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) + ExpectScheduled(ctx, env.Client, pod) - // Verify initial VM size marked as unavailable due to allocation failure - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) + Expect(azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Zones).To(BeEmpty()) + }) - // Clear the error and retry - should succeed with the same VM size placed regionally - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) - Expect(node.Labels[v1.LabelTopologyZone]).To(Equal(zones.Regional)) + // Ported from VM test: "should support provisioning non-zonal instance types in zonal regions" + It("should support provisioning non-zonal instance types in zonal regions", func() { + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC6s_v3"}, // Non-zonal instance type }) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Ported from VM test: "should fail to provision when AllocationFailure errors are hit and regional placement is unavailable" - It("should fail to provision when AllocationFailure errors are hit and regional placement is unavailable", func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: 
v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v3"}, - }) - sku := fake.MakeSKU("Standard_D2_v3") - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeOnDemand) - ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeOnDemand) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zones.Regional)) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - }) + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Zones).To(BeEmpty()) + }) + }) - It("should fail to provision when AllocationFailure errors are hit and all placements for the VM size are unavailable, then switch VM size and succeed", func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + // Ported from VM test: "CloudProvider Create Error Cases" + Context("Create - CloudProvider Create Error Cases", func() { + // Ported from VM test: "should return an ICE error when there are no instance types to launch" + // But, from cloudprovider/suite_test.go rather than instancetype/suite_test.go + It("should return an ICE error when there are no instance types to launch", func() { + // Specify no instance types and expect to receive a capacity error + nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ + { Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v3", "Standard_D64s_v3"}, - }) - sku := fake.MakeSKU("Standard_D2_v3") - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - aksMachine := 
azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - initialVMSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeOnDemand) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, karpv1.CapacityTypeOnDemand) - - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).ToNot(Equal(initialVMSize)) - }) - - // Ported from VM test: "should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone" - It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 24, 24, 8, 32) - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // Verify the create API was called but failed due to family quota - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + Values: []string{"doesnotexist"}, // will not match any instance types, + }, + } 
+ + ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) + cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(cloudProviderMachine).To(BeNil()) + }) - // Clear the error and retry - should succeed - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + // Ported from VM test: "should return error when NodeClass readiness is Unknown" + It("should return error when NodeClass readiness is Unknown", func() { + nodeClass.StatusConditions().SetUnknown(corestatus.ConditionReady) + testNodeClaim2 := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + karpv1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + }, + }, }) - // Ported from VM test: "should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone" - It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 0, 0, 8, 8) + ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim2) + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim2) + Expect(err).To(HaveOccurred()) + Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) + Expect(claim).To(BeNil()) + 
Expect(err.Error()).To(ContainSubstring("resolving NodeClass readiness, NodeClass is in Ready=Unknown")) + }) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) + // Ported from VM test: "should return error when instance type resolution fails" + It("should return error when instance type resolution fails", func() { + // Create and set up the status controller + localStatusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) - // Verify the create API was called but failed due to zero quota limit - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + // Set NodeClass to Ready + nodeClass.StatusConditions().SetTrue(karpv1.ConditionTypeLaunched) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Clear the error and retry - should succeed - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - }) + // Reconcile the NodeClass to ensure status is updated + ExpectObjectReconciled(ctx, env.Client, localStatusController, nodeClass) - // Ported from VM test: Total Regional Cores quota test pattern - It("should return ICE if Total Regional Cores Quota errors are hit", func() { - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorTotalRegionalCoresQuota(fake.Region) + // Flush the cache to simulate 
the controller not having run yet. + // With the instance type controller, SKU API errors happen during + // UpdateInstanceTypes (controller reconcile), not during List. + // When the cache is empty, List returns an error. + azureEnv.InstanceTypesProvider.Reset() - // Create nodeClaim directly and call cloudProvider.Create like VM tests - testNodeClaim1 := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, + testNodeClaim3 := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + karpv1.NodePoolLabelKey: nodePool.Name, }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, + }, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, }, - }) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim1) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim1) - Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) - Expect(claim).To(BeNil()) + }, }) - }) - // Ported from VM test: "Zone-aware provisioning" - Context("Create - Zone-aware provisioning", func() { - // Ported from VM test: "should launch in the NodePool-requested zone" - It("should launch in the NodePool-requested zone", func() { - zone, aksMachineZone := fmt.Sprintf("%s-3", fake.Region), "3" - nodePool.Spec.Template.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - {Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeSpot, karpv1.CapacityTypeOnDemand}}, - {Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{zone}}, - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := 
coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zone)) + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim3) + Expect(err).To(HaveOccurred()) + Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) + Expect(claim).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("resolving instance types")) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine).NotTo(BeNil()) - Expect(aksMachine.Zones).To(ConsistOf(&aksMachineZone)) - }) + // Reset instance types + Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + }) - // Ported from VM test: "should support provisioning in non-zonal regions" - It("should support provisioning in non-zonal regions", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) - ExpectScheduled(ctx, env.Client, pod) + // Ported from VM test: "should return error when instance creation fails" + It("should return error when instance creation fails", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) - Expect(azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Zones).To(BeEmpty()) + // Create a NodeClaim with valid requirements + testNodeClaim4 := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: 
map[string]string{ + karpv1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + }, + }, }) - // Ported from VM test: "should support provisioning non-zonal instance types in zonal regions" - It("should support provisioning non-zonal instance types in zonal regions", func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC6s_v3"}, // Non-zonal instance type - }) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + // Set up the AKS machine provider to fail (different from VM API) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAny() - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim4) + Expect(err).To(HaveOccurred()) + Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) + Expect(claim).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("creating AKS machine failed")) + }) + }) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zones.Regional)) + // Mostly ported from VM test: "Provider list" + Context("Create - Provider list", func() { + // Ported from VM test: "should support individual instance type labels" + // TODO(mattchr): rework this from VM test (new additions) + // It("should support individual instance type labels", func() { + // ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + // nodeSelector := map[string]string{ + // // Well known + // v1.LabelTopologyRegion: fake.Region, + // karpv1.NodePoolLabelKey: nodePool.Name, + // 
v1.LabelTopologyZone: fakeZone1, + // v1.LabelInstanceTypeStable: "Standard_NC24ads_A100_v4", + // v1.LabelOSStable: "linux", + // v1.LabelArchStable: "amd64", + // karpv1.CapacityTypeLabelKey: "on-demand", + // // Well Known to AKS + // v1beta1.LabelSKUName: "Standard_NC24ads_A100_v4", + // v1beta1.LabelSKUFamily: "N", + // v1beta1.LabelSKUVersion: "4", + // v1beta1.LabelSKUStorageEphemeralOSMaxSize: "429", + // v1beta1.LabelSKUAcceleratedNetworking: "true", + // v1beta1.LabelSKUStoragePremiumCapable: "true", + // v1beta1.LabelSKUGPUName: "A100", + // v1beta1.LabelSKUGPUManufacturer: "nvidia", + // v1beta1.LabelSKUGPUCount: "1", + // v1beta1.LabelSKUCPU: "24", + // v1beta1.LabelSKUMemory: "8192", + // // Deprecated Labels + // v1.LabelFailureDomainBetaRegion: fake.Region, + // v1.LabelFailureDomainBetaZone: fakeZone1, + // "beta.kubernetes.io/arch": "amd64", + // "beta.kubernetes.io/os": "linux", + // v1.LabelInstanceType: "Standard_NC24ads_A100_v4", + // "topology.disk.csi.azure.com/zone": fakeZone1, + // v1.LabelWindowsBuild: "window", + // // Cluster Label + // v1beta1.AKSLabelCluster: "test-cluster", + // } + + // // Ensure that we're exercising all well known labels + // Expect(lo.Keys(nodeSelector)).To(ContainElements(append(karpv1.WellKnownLabels.UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...))) + + // var pods []*v1.Pod + // for key, value := range nodeSelector { + // pods = append(pods, coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{key: value}})) + // } + // ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) 
+ // for _, pod := range pods { + // ExpectScheduled(ctx, env.Client, pod) + // } + // }) + }) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Zones).To(BeEmpty()) - }) + // Ported from VM test: "Unavailable Offerings" + Context("Create - Unavailable Offerings", func() { + // Ported from VM test: "should not allocate a vm in a zone marked as unavailable" + It("should not allocate an AKS machine in a zone marked as unavailable", func() { + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v2"}}) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(fakeZone1)) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) }) - // Ported from VM test: "CloudProvider Create Error Cases" - Context("Create - CloudProvider Create Error Cases", func() { - // Ported from VM test: "should return an ICE error when there are no instance types to launch" - // But, from cloudprovider/suite_test.go rather than instancetype/suite_test.go - It("should return an ICE error when there are no instance types to launch", func() { - // Specify no instance types and expect to receive a capacity error - 
nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - { - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"doesnotexist"}, // will not match any instance types, + // Ported from VM test: "should handle ZonalAllocationFailed on creating the VM" + It("should handle ZonalAllocationFailed on creating the AKS machine", func() { + // Set up async error via BOTH Error and Output (LRO returns both) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorZoneAllocationFailed("Standard_D2_v2", "1") + + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v2"}}) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + By("marking whatever zone was picked as unavailable - for both spot and on-demand") + // When ZonalAllocationFailed error is encountered, we block all VM sizes that have >= vCPUs as the VM size for which we encountered the error + expectedUnavailableSKUs := []*skewer.SKU{ + { + Name: lo.ToPtr("Standard_D2_v2"), + Size: lo.ToPtr("D2_v2"), + Family: lo.ToPtr("StandardDv2Family"), + Capabilities: &[]compute.ResourceSkuCapabilities{ + { + Name: lo.ToPtr("vCPUs"), + Value: lo.ToPtr("2"), + }, }, - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) - cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) - Expect(cloudProviderMachine).To(BeNil()) - }) - - // Ported from VM test: "should return error when NodeClass readiness is Unknown" - It("should return error when NodeClass readiness is Unknown", func() { - 
nodeClass.StatusConditions().SetUnknown(corestatus.ConditionReady) - testNodeClaim2 := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, + }, + { + Name: lo.ToPtr("Standard_D16_v2"), + Size: lo.ToPtr("D16_v2"), + Family: lo.ToPtr("StandardDv2Family"), + Capabilities: &[]compute.ResourceSkuCapabilities{ + { + Name: lo.ToPtr("vCPUs"), + Value: lo.ToPtr("16"), }, }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, + }, + { + Name: lo.ToPtr("Standard_D32_v2"), + Size: lo.ToPtr("D32_v2"), + Family: lo.ToPtr("StandardDv2Family"), + Capabilities: &[]compute.ResourceSkuCapabilities{ + { + Name: lo.ToPtr("vCPUs"), + Value: lo.ToPtr("32"), }, }, - }) + }, + } - ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim2) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim2) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("resolving NodeClass readiness, NodeClass is in Ready=Unknown")) - }) + // For AKS Machine API, we need to determine the zone from the machine creation attempt + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) + machineInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - // Ported from VM test: "should return error when instance type resolution fails" - It("should return error when instance type resolution fails", func() { - // Create and set up the status controller - localStatusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, 
azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + // Extract zone from AKS machine - similar to VM test pattern + failedZone, err := instance.GetAKSLabelZoneFromAKSMachine(&machineInput.AKSMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) - // Set NodeClass to Ready - nodeClass.StatusConditions().SetTrue(karpv1.ConditionTypeLaunched) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + for _, skuToCheck := range expectedUnavailableSKUs { + Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, failedZone, karpv1.CapacityTypeSpot)).To(BeTrue()) + Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, failedZone, karpv1.CapacityTypeOnDemand)).To(BeTrue()) + } - // Reconcile the NodeClass to ensure status is updated - ExpectObjectReconciled(ctx, env.Client, localStatusController, nodeClass) + By("successfully scheduling in a different zone on retry") + // Clear the error and verify retry succeeds in different zone + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - // Flush the cache to simulate the controller not having run yet. - // With the instance type controller, SKU API errors happen during - // UpdateInstanceTypes (controller reconcile), not during List. - // When the cache is empty, List returns an error. 
- azureEnv.InstanceTypesProvider.Reset() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) - testNodeClaim3 := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, - }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, - }, - }) + // Verify machine was created in a different zone than the failed one + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(failedZone)) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) + }) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim3) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("resolving instance types")) + // Ported from VM test: DescribeTable "Should not return unavailable offerings" + Context("should not return unavailable offerings", func() { + It("should leave regional offerings available when all real zones are unavailable", func() { + for _, zone := range azureEnv.Zones() { + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) + } + instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClass) + Expect(err).ToNot(HaveOccurred()) - // Reset instance types - Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + seeUnavailable := false + for _, instanceType := range instanceTypes { + if 
instanceType.Name == "Standard_D2_v2" { + // We want to validate we see the offering in the list, + // but only the regional offerings should remain available. + seeUnavailable = true + Expect(lo.Map(instanceType.Offerings.Available(), func(offering *corecloudprovider.Offering, _ int) string { + return offering.Requirements.Get(v1.LabelTopologyZone).Any() + })).To(ConsistOf(zones.Regional, zones.Regional)) + } else { + Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) + } + } + // we should see the unavailable offering in the list + Expect(seeUnavailable).To(BeTrue()) }) + It("should not return unavailable offerings - non-zonal", func() { + for _, zone := range azureEnvNonZonal.Zones() { + azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) + azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) + } + instanceTypes, err := azureEnvNonZonal.InstanceTypesProvider.List(ctx, nodeClass) + Expect(err).ToNot(HaveOccurred()) - // Ported from VM test: "should return error when instance creation fails" - It("should return error when instance creation fails", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Create a NodeClaim with valid requirements - testNodeClaim4 := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, - }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, - }, - }) - - // Set up the AKS machine provider to fail (different from VM API) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAny() - - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, 
azureEnv, testNodeClaim4) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("creating AKS machine failed")) + seeUnavailable := false + for _, instanceType := range instanceTypes { + if instanceType.Name == "Standard_D2_v2" { + // We want to validate we see the offering in the list, + // but we also expect it to not have any available offerings + seeUnavailable = true + Expect(len(instanceType.Offerings.Available())).To(Equal(0)) + } else { + Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) + } + } + // we should see the unavailable offering in the list + Expect(seeUnavailable).To(BeTrue()) }) }) - // Mostly ported from VM test: "Provider list" - Context("Create - Provider list", func() { - // Ported from VM test: "should support individual instance type labels" - // TODO(mattchr): rework this from VM test (new additions) - // It("should support individual instance type labels", func() { - // ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // nodeSelector := map[string]string{ - // // Well known - // v1.LabelTopologyRegion: fake.Region, - // karpv1.NodePoolLabelKey: nodePool.Name, - // v1.LabelTopologyZone: fakeZone1, - // v1.LabelInstanceTypeStable: "Standard_NC24ads_A100_v4", - // v1.LabelOSStable: "linux", - // v1.LabelArchStable: "amd64", - // karpv1.CapacityTypeLabelKey: "on-demand", - // // Well Known to AKS - // v1beta1.LabelSKUName: "Standard_NC24ads_A100_v4", - // v1beta1.LabelSKUFamily: "N", - // v1beta1.LabelSKUVersion: "4", - // v1beta1.LabelSKUStorageEphemeralOSMaxSize: "429", - // v1beta1.LabelSKUAcceleratedNetworking: "true", - // v1beta1.LabelSKUStoragePremiumCapable: "true", - // v1beta1.LabelSKUGPUName: "A100", - // v1beta1.LabelSKUGPUManufacturer: "nvidia", - // v1beta1.LabelSKUGPUCount: "1", - // v1beta1.LabelSKUCPU: "24", - // v1beta1.LabelSKUMemory: "8192", - // // Deprecated Labels - // 
v1.LabelFailureDomainBetaRegion: fake.Region, - // v1.LabelFailureDomainBetaZone: fakeZone1, - // "beta.kubernetes.io/arch": "amd64", - // "beta.kubernetes.io/os": "linux", - // v1.LabelInstanceType: "Standard_NC24ads_A100_v4", - // "topology.disk.csi.azure.com/zone": fakeZone1, - // v1.LabelWindowsBuild: "window", - // // Cluster Label - // v1beta1.AKSLabelCluster: "test-cluster", - // } - - // // Ensure that we're exercising all well known labels - // Expect(lo.Keys(nodeSelector)).To(ContainElements(append(karpv1.WellKnownLabels.UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...))) - - // var pods []*v1.Pod - // for key, value := range nodeSelector { - // pods = append(pods, coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{key: value}})) - // } - // ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) - // for _, pod := range pods { - // ExpectScheduled(ctx, env.Client, pod) - // } - // }) - }) - - // Ported from VM test: "Unavailable Offerings" - Context("Create - Unavailable Offerings", func() { - // Ported from VM test: "should not allocate a vm in a zone marked as unavailable" - It("should not allocate an AKS machine in a zone marked as unavailable", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v2"}}) + // Ported from VM test: "should launch instances in a different zone than preferred" + It("should launch instances in a different zone than preferred when zone is unavailable", func() { + 
azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(fakeZone1)) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + pod := coretest.UnschedulablePod(coretest.PodOptions{ + NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, }) - - // Ported from VM test: "should handle ZonalAllocationFailed on creating the VM" - It("should handle ZonalAllocationFailed on creating the AKS machine", func() { - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorZoneAllocationFailed("Standard_D2_v2", "1") - - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v2"}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - By("marking whatever zone was picked as unavailable - for both spot and on-demand") - // When ZonalAllocationFailed error is encountered, we block all VM sizes that have >= vCPUs as the VM size for which we encountered the error - expectedUnavailableSKUs := []*skewer.SKU{ - { - 
Name: lo.ToPtr("Standard_D2_v2"), - Size: lo.ToPtr("D2_v2"), - Family: lo.ToPtr("StandardDv2Family"), - Capabilities: &[]compute.ResourceSkuCapabilities{ - { - Name: lo.ToPtr("vCPUs"), - Value: lo.ToPtr("2"), + pod.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ + { + Weight: 1, + Preference: v1.NodeSelectorTerm{ + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{fakeZone1}, + }, + }, }, }, }, - { - Name: lo.ToPtr("Standard_D16_v2"), - Size: lo.ToPtr("D16_v2"), - Family: lo.ToPtr("StandardDv2Family"), - Capabilities: &[]compute.ResourceSkuCapabilities{ - { - Name: lo.ToPtr("vCPUs"), - Value: lo.ToPtr("16"), - }, - }, + }, + } + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(fakeZone1)) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) + }) + + // Ported from VM test: "should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error" + It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() { + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeSpot) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_DS2_v2", "Standard_F16s_v2"}}) + pods := []*v1.Pod{} + for i := 0; i < 2; i++ { + pods = append(pods, 
coretest.UnschedulablePod(coretest.PodOptions{ + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, }, - { - Name: lo.ToPtr("Standard_D32_v2"), - Size: lo.ToPtr("D32_v2"), - Family: lo.ToPtr("StandardDv2Family"), - Capabilities: &[]compute.ResourceSkuCapabilities{ - { - Name: lo.ToPtr("vCPUs"), - Value: lo.ToPtr("32"), - }, - }, + NodeSelector: map[string]string{ + v1.LabelTopologyZone: fakeZone1, }, - } - - // For AKS Machine API, we need to determine the zone from the machine creation attempt - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) - machineInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + })) + } + // Provisions 2 smaller instances since larger was ICE'd + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) 
- // Extract zone from AKS machine - similar to VM test pattern - failedZone, err := instance.GetAKSLabelZoneFromAKSMachine(&machineInput.AKSMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) + nodeNames := sets.New[string]() + for _, pod := range pods { + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_DS2_v2")) + nodeNames.Insert(node.Name) + } + Expect(nodeNames.Len()).To(Equal(2)) + }) - for _, skuToCheck := range expectedUnavailableSKUs { - Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, failedZone, karpv1.CapacityTypeSpot)).To(BeTrue()) - Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, failedZone, karpv1.CapacityTypeOnDemand)).To(BeTrue()) + // Ported from VM test: "should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry" + Context("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry", func() { + It("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry - zonal", func() { + for _, zone := range azureEnv.Zones() { + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) } + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeOnDemand) - By("successfully scheduling in a different zone on retry") - // Clear the error and verify retry succeeds in different zone - 
azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + pod := coretest.UnschedulablePod(coretest.PodOptions{ + NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, + }) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + // capacity shortage is over - expire the items from the cache and try again + azureEnv.UnavailableOfferingsCache.Flush() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) node := ExpectScheduled(ctx, env.Client, pod) - - // Verify machine was created in a different zone than the failed one - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(failedZone)) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) - }) - - // Ported from VM test: DescribeTable "Should not return unavailable offerings" - Context("should not return unavailable offerings", func() { - It("should leave regional offerings available when all real zones are unavailable", func() { - for _, zone := range azureEnv.Zones() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } - instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClass) - Expect(err).ToNot(HaveOccurred()) - - seeUnavailable := false - for _, instanceType := range instanceTypes { - if instanceType.Name == "Standard_D2_v2" { - // We want to validate we see the offering in the list, - // but only the regional offerings should remain available. 
- seeUnavailable = true - Expect(lo.Map(instanceType.Offerings.Available(), func(offering *corecloudprovider.Offering, _ int) string { - return offering.Requirements.Get(v1.LabelTopologyZone).Any() - })).To(ConsistOf(zones.Regional, zones.Regional)) - } else { - Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) - } - } - // we should see the unavailable offering in the list - Expect(seeUnavailable).To(BeTrue()) - }) - It("should not return unavailable offerings - non-zonal", func() { - for _, zone := range azureEnvNonZonal.Zones() { - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } - instanceTypes, err := azureEnvNonZonal.InstanceTypesProvider.List(ctx, nodeClass) - Expect(err).ToNot(HaveOccurred()) - - seeUnavailable := false - for _, instanceType := range instanceTypes { - if instanceType.Name == "Standard_D2_v2" { - // We want to validate we see the offering in the list, - // but we also expect it to not have any available offerings - seeUnavailable = true - Expect(len(instanceType.Offerings.Available())).To(Equal(0)) - } else { - Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) - } - } - // we should see the unavailable offering in the list - Expect(seeUnavailable).To(BeTrue()) - }) + Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_D2_v2")) }) - - // Ported from VM test: "should launch instances in a different zone than preferred" - It("should launch instances in a different zone than preferred when zone is unavailable", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) - 
azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) + It("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry - non-zonal", func() { + for _, zone := range azureEnvNonZonal.Zones() { + azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) + azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) + } ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod(coretest.PodOptions{ NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, }) - pod.Spec.Affinity = &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ - { - Weight: 1, - Preference: v1.NodeSelectorTerm{ - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{fakeZone1}, - }, - }, - }, - }, - }, - }, - } - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + // capacity shortage is over - expire the items from the cache and try again + azureEnvNonZonal.UnavailableOfferingsCache.Flush() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(fakeZone1)) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) + 
Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_D2_v2")) }) + }) - // Ported from VM test: "should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error" - It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeSpot) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_DS2_v2", "Standard_F16s_v2"}}) - pods := []*v1.Pod{} - for i := 0; i < 2; i++ { - pods = append(pods, coretest.UnschedulablePod(coretest.PodOptions{ - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, - }, - NodeSelector: map[string]string{ - v1.LabelTopologyZone: fakeZone1, - }, - })) - } - // Provisions 2 smaller instances since larger was ICE'd - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) 
+ // Ported from VM test context: "SkuNotAvailable" + Context("SKUNotAvailable", func() { + AssertUnavailable := func(sku *skewer.SKU, capacityType string) { + // Simulate SKU not available error via AKS Machine API + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorSkuNotAvailable(sku.GetName(), fake.Region) - nodeNames := sets.New[string]() - for _, pod := range pods { - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_DS2_v2")) - nodeNames.Insert(node.Name) + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, + ) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + for _, zoneID := range []string{"1", "2", "3"} { + ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), capacityType) } - Expect(nodeNames.Len()).To(Equal(2)) - }) + } - // Ported from VM test: "should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry" - Context("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry", func() { - It("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry - zonal", func() { - for _, zone := range azureEnv.Zones() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, 
"SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeOnDemand) - - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, - }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // capacity shortage is over - expire the items from the cache and try again - azureEnv.UnavailableOfferingsCache.Flush() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_D2_v2")) - }) - It("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry - non-zonal", func() { - for _, zone := range azureEnvNonZonal.Zones() { - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } + // Ported from VM test: "should mark SKU as unavailable in all zones for Spot" + It("should mark SKU as unavailable in all zones for Spot", func() { + AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeSpot) + }) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - NodeSelector: 
map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, - }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // capacity shortage is over - expire the items from the cache and try again - azureEnvNonZonal.UnavailableOfferingsCache.Flush() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_D2_v2")) - }) + // Ported from VM test: "should mark SKU as unavailable in all zones for OnDemand" + It("should mark SKU as unavailable in all zones for OnDemand", func() { + AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeOnDemand) }) + }) - // Ported from VM test context: "SkuNotAvailable" - Context("SKUNotAvailable", func() { - AssertUnavailable := func(sku *skewer.SKU, capacityType string) { - // Simulate SKU not available error via AKS Machine API - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorSkuNotAvailable(sku.GetName(), fake.Region) - - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, - ) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - for _, zoneID := range []string{"1", "2", "3"} { - ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), 
capacityType) - } - } + // This is from AKS RP frontend errors rather then CRP + Context("SKUNotAvailable - AKS Machine API sync phase", func() { + AssertUnavailableSync := func(syncErr *azcore.ResponseError, sku *skewer.SKU, capacityType string) { + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(syncErr) - // Ported from VM test: "should mark SKU as unavailable in all zones for Spot" - It("should mark SKU as unavailable in all zones for Spot", func() { - AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeSpot) - }) + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, + ) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + for _, zoneID := range []string{"1", "2", "3"} { + ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), capacityType) + } + } - // Ported from VM test: "should mark SKU as unavailable in all zones for OnDemand" - It("should mark SKU as unavailable in all zones for OnDemand", func() { - AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeOnDemand) - }) + It("should handle VMSizeNotSupported sync error and mark SKU unavailable", func() { + AssertUnavailableSync( + fake.AKSMachineAPIErrorVMSizeNotSupported(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), + defaultTestSKU, karpv1.CapacityTypeOnDemand, + ) }) - // This is from AKS RP frontend errors rather then CRP - Context("SKUNotAvailable - AKS Machine API sync phase", func() { - AssertUnavailableSync := func(syncErr *azcore.ResponseError, sku 
*skewer.SKU, capacityType string) { - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(syncErr) - - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, - ) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - for _, zoneID := range []string{"1", "2", "3"} { - ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), capacityType) - } - } + It("should handle BadRequest 'not supported for subscription' sync error and mark SKU unavailable", func() { + AssertUnavailableSync( + fake.AKSMachineAPIErrorVMSizeNotSupportedBadRequest(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), + defaultTestSKU, karpv1.CapacityTypeSpot, + ) + }) + }) + }) - It("should handle VMSizeNotSupported sync error and mark SKU unavailable", func() { - AssertUnavailableSync( - fake.AKSMachineAPIErrorVMSizeNotSupported(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), - defaultTestSKU, karpv1.CapacityTypeOnDemand, - ) - }) +} - It("should handle BadRequest 'not supported for subscription' sync error and mark SKU unavailable", func() { - AssertUnavailableSync( - fake.AKSMachineAPIErrorVMSizeNotSupportedBadRequest(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), - defaultTestSKU, karpv1.CapacityTypeSpot, - ) - }) +var _ = Describe("CloudProvider", func() { + Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: 
lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), + UseSIG: lo.ToPtr(true), }) + + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) + + azureEnv = test.NewEnvironment(ctx, env) + azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) + + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) + + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + }) + + AfterEach(func() { + // Wait for any async polling goroutines to complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + azureEnvNonZonal.Reset(ctx) }) + runAKSMachineAPIOfferingTests() }) }) From 5c046e3adc2a9056d1c45875bf2f94ab05551d72 Mon Sep 17 
00:00:00 2001 From: Robin Deeboonchai Date: Mon, 8 Jun 2026 21:35:55 -0700 Subject: [PATCH 2/4] test: reunify tests + misc setup fixes from reunification --- .vscode/settings.json | 2 +- pkg/cloudprovider/suite_drift_test.go | 407 +-- pkg/cloudprovider/suite_features_test.go | 3196 +++++++++++++------ pkg/cloudprovider/suite_integration_test.go | 888 ++++-- pkg/cloudprovider/suite_modes_test.go | 81 - pkg/cloudprovider/suite_offerings_test.go | 1179 ++++--- pkg/cloudprovider/suite_test.go | 424 +-- pkg/providers/instancetype/suite_test.go | 2867 +++-------------- 8 files changed, 4233 insertions(+), 4811 deletions(-) delete mode 100644 pkg/cloudprovider/suite_modes_test.go diff --git a/.vscode/settings.json b/.vscode/settings.json index 9f146249c..fc81bd174 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,5 @@ { - "go.testTimeout": "420s", + "go.testTimeout": "900s", "gopls": { "build.directoryFilters": [ "-hack" diff --git a/pkg/cloudprovider/suite_drift_test.go b/pkg/cloudprovider/suite_drift_test.go index cfb17a39c..3731c818a 100644 --- a/pkg/cloudprovider/suite_drift_test.go +++ b/pkg/cloudprovider/suite_drift_test.go @@ -44,41 +44,58 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/test" ) -func runAKSMachineAPIDriftTests() { +func runDriftTests(provisionMode provisionModeTestCase) { Context("Drift", func() { var nodeClaim *karpv1.NodeClaim - var node *v1.Node - var createInput *fake.AKSMachineCreateOrUpdateInput + var providerInstanceName string BeforeEach(func() { instanceType := "Standard_D2_v2" ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectNodeClassHashUpdated(ctx, env.Client, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{ NodeSelector: map[string]string{v1.LabelInstanceTypeStable: instanceType}, }) ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node = ExpectScheduled(ctx, env.Client, pod) - // KubeletVersion must be applied 
to the node to satisfy k8s drift + node := ExpectScheduled(ctx, env.Client, pod) if nodeClass.Status.KubernetesVersion != nil { node.Status.NodeInfo.KubeletVersion = "v" + *nodeClass.Status.KubernetesVersion } - node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] = "61f71907-753f-4802-a901-47361c3664f2" // random UUID + node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] = "61f71907-753f-4802-a901-47361c3664f2" + + opts := *options.FromContext(ctx) + opts.KubeletIdentityClientID = node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] + ctx = options.ToContext(ctx, &opts) ExpectApplied(ctx, env.Client, node) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput = azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + providerInstanceName = lo.FromPtr(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine.Name) + } else { + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + providerInstanceName = lo.FromPtr(&azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VMName) + } nodeClaims, err := cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) Expect(nodeClaims).To(HaveLen(1)) - nodeClaim = nodeClaims[0] - nodeClaim.Status.NodeName = node.Name // Normally core would do this. 
+ listedNodeClaim := nodeClaims[0] + Expect(node.Spec.ProviderID).ToNot(BeEmpty()) + nodeClaim = &karpv1.NodeClaim{} + Expect(env.Client.Get(ctx, types.NamespacedName{Name: listedNodeClaim.Name}, nodeClaim)).To(Succeed()) + nodeClaim.Status = listedNodeClaim.Status + nodeClaim.Status.ProviderID = node.Spec.ProviderID + nodeClaim.Status.NodeName = node.Name nodeClaim.Spec.NodeClassRef = &karpv1.NodeClassReference{ Group: object.GVK(nodeClass).Group, Kind: object.GVK(nodeClass).Kind, Name: nodeClass.Name, } + ExpectApplied(ctx, env.Client, nodeClaim) }) It("should not fail if nodeClass does not exist", func() { @@ -108,36 +125,6 @@ func runAKSMachineAPIDriftTests() { Expect(drifted).To(BeEmpty()) }) - Context("Node Image Drift", func() { - It("should trigger drift when DriftAction field is available", func() { - // Find the AKS machine that was created during BeforeEach - aksMachineID := fake.MkMachineID(testOptions.NodeResourceGroup, testOptions.ClusterName, testOptions.AKSMachinesPoolName, createInput.AKSMachineName) - - // Get the existing machine from the fake store - existingMachine, ok := azureEnv.AKSDataStorage.AKSMachines.Load(aksMachineID) - Expect(ok).To(BeTrue(), "AKS machine should exist in fake store") - - aksMachine := existingMachine - - // Set DriftAction to "Recreate" to trigger drift - if aksMachine.Properties == nil { - aksMachine.Properties = &armcontainerservice.MachineProperties{} - } - if aksMachine.Properties.Status == nil { - aksMachine.Properties.Status = &armcontainerservice.MachineStatus{} - } - aksMachine.Properties.Status.DriftAction = lo.ToPtr(armcontainerservice.DriftActionRecreate) - aksMachine.Properties.Status.DriftReason = lo.ToPtr("ClusterConfigurationChanged") - - // Update the machine in the fake store - azureEnv.AKSDataStorage.AKSMachines.Store(aksMachineID, aksMachine) - - drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(ClusterConfigDrift)) - }) - }) - 
Context("Node Image Drift", func() { It("should succeed with no drift when nothing changes", func() { drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) @@ -154,7 +141,7 @@ func runAKSMachineAPIDriftTests() { Expect(drifted).To(Equal(NoDrift)) }) - // Note: this case shouldn't be able to happen in practice since if Images is empty ConditionTypeImagesReady should be false. + // Empty Images should normally make ImagesReady false before drift reaches this branch. It("should error when Images are empty", func() { nodeClass = ExpectExists(ctx, env.Client, nodeClass) nodeClass.Status.Images = []v1beta1.NodeImage{} @@ -171,6 +158,17 @@ func runAKSMachineAPIDriftTests() { Expect(err).ToNot(HaveOccurred()) Expect(drifted).To(Equal(ImageDrift)) }) + + // Machine API mode never support CIG + if !provisionMode.isAKSMachineMode() { + It("should trigger drift when the image gallery changes to SIG", func() { + test.ApplySIGImages(nodeClass) + ExpectApplied(ctx, env.Client, nodeClass) + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(ImageDrift)) + }) + } }) Context("Kubernetes Version", func() { @@ -214,7 +212,7 @@ func runAKSMachineAPIDriftTests() { }) It("shouldn't error or be drifted when node is deleting", func() { - node = ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) + node := ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) node.Finalizers = append(node.Finalizers, test.TestingFinalizer) ExpectApplied(ctx, env.Client, node) Expect(env.Client.Delete(ctx, node)).ToNot(HaveOccurred()) @@ -222,7 +220,6 @@ func runAKSMachineAPIDriftTests() { Expect(err).ToNot(HaveOccurred()) Expect(drifted).To(Equal(NoDrift)) - // cleanup node = ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) deepCopy := node.DeepCopy() node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { @@ -246,6 +243,80 @@ func runAKSMachineAPIDriftTests() { 
Expect(drifted).To(Equal(K8sVersionDrift)) }) }) + + Context("Static fields", func() { + It("should trigger drift if NodeClass subnet changed", func() { + testSubnetID := "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/aks-vnet-12345678/subnets/my-subnet" + nodeClass.Spec.VNETSubnetID = lo.ToPtr(testSubnetID) + ExpectApplied(ctx, env.Client, nodeClass) + ExpectNodeClassHashUpdated(ctx, env.Client, nodeClass) + + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NodeClassDrift)) + }) + + It("should trigger drift if ImageFamily changed", func() { + nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.AzureLinuxImageFamily) + ExpectApplied(ctx, env.Client, nodeClass) + ExpectNodeClassHashUpdated(ctx, env.Client, nodeClass) + + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(NodeClassDrift)) + }) + }) + + // DriftAction is Machine API specific design. 
+ if provisionMode.isAKSMachineMode() { + Context("AKS Machine DriftAction", func() { + It("should trigger drift when DriftAction field is available", func() { + aksMachineID := fake.MkMachineID(testOptions.NodeResourceGroup, testOptions.ClusterName, testOptions.AKSMachinesPoolName, providerInstanceName) + existingMachine, ok := azureEnv.AKSDataStorage.AKSMachines.Load(aksMachineID) + Expect(ok).To(BeTrue(), "AKS machine should exist in fake store") + + aksMachine := existingMachine + if aksMachine.Properties == nil { + aksMachine.Properties = &armcontainerservice.MachineProperties{} + } + if aksMachine.Properties.Status == nil { + aksMachine.Properties.Status = &armcontainerservice.MachineStatus{} + } + aksMachine.Properties.Status.DriftAction = lo.ToPtr(armcontainerservice.DriftActionRecreate) + aksMachine.Properties.Status.DriftReason = lo.ToPtr("ClusterConfigurationChanged") + azureEnv.AKSDataStorage.AKSMachines.Store(aksMachineID, aksMachine) + + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(ClusterConfigDrift)) + }) + }) + } + + // For Machine API modes, Kubelet Client ID drift is handled by Machine API. 
+ if !provisionMode.isAKSMachineMode() { + Context("Kubelet Client ID", func() { + It("should NOT trigger drift if node doesn't have kubelet client ID label", func() { + node := ExpectNodeExists(ctx, env.Client, nodeClaim.Status.NodeName) + node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] = "" + ExpectApplied(ctx, env.Client, node) + + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(BeEmpty()) + }) + + It("should trigger drift if node kubelet client ID doesn't match options", func() { + opts := *options.FromContext(ctx) + opts.KubeletIdentityClientID = "3824ff7a-93b6-40af-b861-2eb621ba437a" + ctx = options.ToContext(ctx, &opts) + + drifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(drifted).To(Equal(KubeletIdentityDrift)) + }) + }) + } }) } @@ -284,12 +355,9 @@ var _ = Describe("CloudProvider", func() { azureEnvNonZonal.Reset(ctx) }) - runAKSMachineAPIDriftTests() + runDriftTests(aksMachineAPIHeaderBatchProvisionMode()) }) - // Attention: tests under "ProvisionMode = AKSScriptless" are not applicable to ProvisionMode = AKSMachineAPI option. - // Due to different assumptions, not all tests can be shared. Add tests for AKS machine instances in a different Context/file. - // If ProvisionMode = AKSScriptless is no longer supported, their code/tests will be replaced with ProvisionMode = AKSMachineAPI. 
Context("ProvisionMode = AKSScriptless", func() { BeforeEach(func() { testOptions = test.Options(test.OptionsFields{ @@ -313,249 +381,6 @@ var _ = Describe("CloudProvider", func() { azureEnv.Reset(ctx) }) - Context("Drift", func() { - var driftNodeClaim *karpv1.NodeClaim - var pod *v1.Pod - var node *v1.Node - - BeforeEach(func() { - // Set up VM provisioning mode environment for drift testing - testOptions = test.Options() - ctx = coreoptions.ToContext(ctx, coretest.Options()) - ctx = options.ToContext(ctx, testOptions) - azureEnv = test.NewEnvironment(ctx, env) - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) - cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) - coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - - instanceType := "Standard_D2_v2" - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectNodeClassHashUpdated(ctx, env.Client, nodeClass) - pod = coretest.UnschedulablePod(coretest.PodOptions{ - NodeSelector: map[string]string{v1.LabelInstanceTypeStable: instanceType}, - }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node = ExpectScheduled(ctx, env.Client, pod) - // KubeletVersion must be applied to the node to satisfy k8s drift - if nodeClass.Status.KubernetesVersion != nil { - node.Status.NodeInfo.KubeletVersion = "v" + *nodeClass.Status.KubernetesVersion - } - - node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] = "61f71907-753f-4802-a901-47361c3664f2" // random UUID - // Context must have same kubelet client id - ctx = options.ToContext(ctx, test.Options(test.OptionsFields{ - KubeletIdentityClientID: lo.ToPtr(node.Labels[v1beta1.AKSLabelKubeletIdentityClientID]), - })) - - ExpectApplied(ctx, env.Client, node) - 
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - - // Corresponding NodeClaim - nodeClaimName := GetNodeClaimNameFromVMName(input.VMName) - driftNodeClaim = &karpv1.NodeClaim{} - Expect(env.Client.Get(ctx, types.NamespacedName{Name: nodeClaimName}, driftNodeClaim)).To(Succeed()) - // ExpectProvisioned doesn't set Status.NodeName -- can be removed once https://github.com/kubernetes-sigs/karpenter/pull/2877 merges - // and we've updated to depend on a version that includes that change (1.9.x?) - driftNodeClaim.Status.NodeName = node.Name - ExpectApplied(ctx, env.Client, driftNodeClaim) - }) - - It("should not fail if nodeClass does not exist", func() { - ExpectDeleted(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) - }) - - It("should not fail if nodePool does not exist", func() { - ExpectDeleted(ctx, env.Client, nodePool) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) - }) - - It("should not return drifted if the NodeClaim is valid", func() { - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) - }) - - It("should error drift if NodeClaim doesn't have provider id", func() { - driftNodeClaim.Status = karpv1.NodeClaimStatus{} - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).To(HaveOccurred()) - Expect(drifted).To(BeEmpty()) - }) - - Context("Node Image Drift", func() { - It("should succeed with no drift when nothing changes", func() { - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - 
Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - It("should succeed with no drift when ConditionTypeImagesReady is not true", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) - nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeImagesReady, "ImagesNoLongerReady", "test when images aren't ready") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - // Note: this case shouldn't be able to happen in practice since if Images is empty ConditionTypeImagesReady should be false. - It("should error when Images are empty", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) - nodeClass.Status.Images = []v1beta1.NodeImage{} - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).To(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - It("should trigger drift when the image gallery changes to SIG", func() { - test.ApplySIGImages(nodeClass) - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(ImageDrift)) - }) - - It("should trigger drift when the image version changes", func() { - test.ApplyCIGImagesWithVersion(nodeClass, "202503.02.0") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(ImageDrift)) - }) - }) - - Context("Kubernetes Version", func() { - It("should succeed with no drift when nothing changes", func() { - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - It("should succeed with no drift when KubernetesVersionReady is not true", func() { - nodeClass = ExpectExists(ctx, 
env.Client, nodeClass) - nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeKubernetesVersionReady, "K8sVersionNoLongerReady", "test when k8s isn't ready") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - // TODO (charliedmcb): I'm wondering if we actually want to have these soft-error cases switch to return an error if no-drift condition was found. - It("shouldn't error or be drifted when KubernetesVersion is empty", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) - nodeClass.Status.KubernetesVersion = lo.ToPtr("") - ExpectApplied(ctx, env.Client, nodeClass) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - It("shouldn't error or be drifted when NodeName is missing", func() { - driftNodeClaim.Status.NodeName = "" - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - It("shouldn't error or be drifted when node is not found", func() { - driftNodeClaim.Status.NodeName = "NodeWhoDoesNotExist" - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - }) - - It("shouldn't error or be drifted when node is deleting", func() { - node = ExpectNodeExists(ctx, env.Client, driftNodeClaim.Status.NodeName) - node.Finalizers = append(node.Finalizers, test.TestingFinalizer) - ExpectApplied(ctx, env.Client, node) - Expect(env.Client.Delete(ctx, node)).ToNot(HaveOccurred()) - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NoDrift)) - - // cleanup - node = ExpectNodeExists(ctx, env.Client, driftNodeClaim.Status.NodeName) - deepCopy := node.DeepCopy() - node.Finalizers = 
lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { - return finalizer == test.TestingFinalizer - }) - Expect(env.Client.Patch(ctx, node, client.StrategicMergeFrom(deepCopy))).NotTo(HaveOccurred()) - ExpectDeleted(ctx, env.Client, node) - }) - - It("should succeed with drift true when KubernetesVersion is new", func() { - nodeClass = ExpectExists(ctx, env.Client, nodeClass) - - semverCurrentK8sVersion := lo.Must(semver.ParseTolerant(*nodeClass.Status.KubernetesVersion)) - semverCurrentK8sVersion.Minor = semverCurrentK8sVersion.Minor + 1 - nodeClass.Status.KubernetesVersion = lo.ToPtr(semverCurrentK8sVersion.String()) - - ExpectApplied(ctx, env.Client, nodeClass) - - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(K8sVersionDrift)) - }) - }) - - Context("Kubelet Client ID", func() { - It("should NOT trigger drift if node doesn't have kubelet client ID label", func() { - node.Labels[v1beta1.AKSLabelKubeletIdentityClientID] = "" // Not set - - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) - }) - - It("should trigger drift if node kubelet client ID doesn't match options", func() { - ctx = options.ToContext(ctx, test.Options(test.OptionsFields{ - KubeletIdentityClientID: lo.ToPtr("3824ff7a-93b6-40af-b861-2eb621ba437a"), // a different random UUID - })) - - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(KubeletIdentityDrift)) - }) - }) - - Context("Static fields", func() { - It("should not trigger drift if NodeClass hasn't changed", func() { - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(BeEmpty()) - }) - - It("should trigger drift if NodeClass subnet changed", func() { - testSubnetID := 
"/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/aks-vnet-12345678/subnets/my-subnet" - nodeClass.Spec.VNETSubnetID = lo.ToPtr(testSubnetID) - ExpectApplied(ctx, env.Client, nodeClass) - ExpectNodeClassHashUpdated(ctx, env.Client, nodeClass) - - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NodeClassDrift)) - }) - - It("should trigger drift if ImageFamily changed", func() { - nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.AzureLinuxImageFamily) - ExpectApplied(ctx, env.Client, nodeClass) - ExpectNodeClassHashUpdated(ctx, env.Client, nodeClass) - - drifted, err := cloudProvider.IsDrifted(ctx, driftNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(drifted).To(Equal(NodeClassDrift)) - }) - }) - - }) + runDriftTests(aksscriptlessProvisionMode()) }) }) diff --git a/pkg/cloudprovider/suite_features_test.go b/pkg/cloudprovider/suite_features_test.go index d277a8f3e..d9a85bea5 100644 --- a/pkg/cloudprovider/suite_features_test.go +++ b/pkg/cloudprovider/suite_features_test.go @@ -17,15 +17,22 @@ limitations under the License. package cloudprovider import ( + "encoding/base64" "fmt" + "strconv" + "strings" + "time" + armcompute "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" . "github.com/Azure/karpenter-provider-azure/pkg/test/expectations" + "github.com/blang/semver/v4" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "github.com/samber/lo" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" @@ -40,186 +47,46 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/apis/v1beta1" "github.com/Azure/karpenter-provider-azure/pkg/consts" "github.com/Azure/karpenter-provider-azure/pkg/controllers/nodeclass/status" + "github.com/Azure/karpenter-provider-azure/pkg/fake" "github.com/Azure/karpenter-provider-azure/pkg/operator/options" "github.com/Azure/karpenter-provider-azure/pkg/providers/imagefamily" + "github.com/Azure/karpenter-provider-azure/pkg/providers/imagefamily/bootstrap" + "github.com/Azure/karpenter-provider-azure/pkg/providers/instance" + "github.com/Azure/karpenter-provider-azure/pkg/providers/labels" + "github.com/Azure/karpenter-provider-azure/pkg/providers/loadbalancer" "github.com/Azure/karpenter-provider-azure/pkg/test" "github.com/Azure/karpenter-provider-azure/pkg/utils" + nodeclaimutils "github.com/Azure/karpenter-provider-azure/pkg/utils/nodeclaim" ) -func runAKSMachineAPIFeatureTests() { - // Mostly ported from VM test: "ImageReference" and "ImageProvider + Image Family" - // Note: AKS Machine API does not support Community Image Gallery (CIG) - Context("Create - ImageReference and ImageProvider + Image Family", func() { - - // Ported from VM test: "should use shared image gallery images when options are set to UseSIG" - It("should use shared image gallery images", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Expect AKS machine to have a shared image gallery reference set via NodeImageVersion - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - - // NodeImageVersion should contain SIG identifier and subscription ID (converted from ImageReference.ID) - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring("AKSUbuntu")) - Expect(nodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) // Format: AKSUbuntu-- - - // Clean up - cluster.Reset() - azureEnv.Reset(ctx) - }) - - // Note: Community Images tests are not ported since Community Images are not supported for AKS Machine API - // This aligns with the warning in utils.GetAKSMachineNodeImageVersionFromImageID() - - // Ported from VM test DescribeTable: "should select the right Shared Image Gallery image for a given instance type" - DescribeTable("should select the right Shared Image Gallery NodeImageVersion for a given instance type", - func(instanceType string, imageFamily string, expectedImageDefinition string) { - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := 
createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - - // NodeImageVersion should contain the expected image definition - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - }, - // Ported entries from VM test, covering SIG images for different generations and architectures - Entry("Gen2, Gen1 instance type with AKSUbuntu image family", "Standard_D2_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ImageDefinition), - Entry("Gen1 instance type with AKSUbuntu image family", "Standard_D2_v3", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen1ImageDefinition), - Entry("ARM instance type with AKSUbuntu image family", "Standard_D16plds_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ArmImageDefinition), - ) - - It("should select the right Shared Image Gallery NodeImageVersion for a given instance type, Gen2 instance type with AzureLinux image family", func() { - instanceType := "Standard_D2_v5" - imageFamily := v1beta1.AzureLinuxImageFamily - kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() - expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) - expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ImageDefinition, imagefamily.AzureLinuxGen2ImageDefinition) - - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, 
env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - - // NodeImageVersion should contain the expected image definition - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - }) - - It("should select the right Shared Image Gallery NodeImageVersion for a given instance type, Gen1 instance type with AzureLinux image family", func() { - instanceType := "Standard_D2_v3" - imageFamily := v1beta1.AzureLinuxImageFamily - kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() - expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) - expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen1ImageDefinition, imagefamily.AzureLinuxGen1ImageDefinition) - - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - - // NodeImageVersion 
should contain the expected image definition - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - }) - - It("should select the right Shared Image Gallery NodeImageVersion for a given instance type, ARM instance type with AzureLinux image family", func() { - instanceType := "Standard_D16plds_v5" - imageFamily := v1beta1.AzureLinuxImageFamily - kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() - expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) - expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ArmImageDefinition, imagefamily.AzureLinuxGen2ArmImageDefinition) - - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - - // NodeImageVersion should contain the expected image definition - nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) - Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) - - // Clean up - cluster.Reset() - azureEnv.Reset(ctx) - }) - }) - - // Ported from VM test: "GPU Workloads + Nodes" +func 
runFeatureTests(provisionMode provisionModeTestCase) { Context("Create - GPU Workloads + Nodes", func() { - // Ported from VM test: "should schedule non-GPU pod onto the cheapest non-GPU capable node" It("should schedule non-GPU pod onto the cheapest non-GPU capable node", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) node := ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - Expect(utils.IsNvidiaEnabledSKU(lo.FromPtr(aksMachine.Properties.Hardware.VMSize))).To(BeFalse()) - + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + vmSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeFalse()) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile.VMSize).ToNot(BeNil()) + vmSize := 
string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize)) + Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeFalse()) + } Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0")) }) - // Ported from VM test: "should schedule GPU pod on GPU capable node" It("should schedule GPU pod on GPU capable node", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{ @@ -248,21 +115,25 @@ func runAKSMachineAPIFeatureTests() { ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) node := ExpectScheduled(ctx, env.Client, pod) - - // the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3")) - // Verify AKS machine GPU selection - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - vmSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) - Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) - - // Verify that the node the pod was scheduled on has GPU resource and labels set + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + vmSize := 
lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile.VMSize).ToNot(BeNil()) + vmSize := string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize)) + Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) + } Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1"))) Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4")) Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1beta1.ManufacturerNvidia)) @@ -270,97 +141,162 @@ func runAKSMachineAPIFeatureTests() { }) }) - // Ported from VM test: Context "additional-tags" Context("Create - Additional Tags", func() { - It("should add additional tags to the AKS machine", func() { - // Set up test context with additional tags - aksTestOptions := test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), - UseSIG: lo.ToPtr(true), - AdditionalTags: map[string]string{ - "karpenter.azure.com/test-tag": "test-value", - }, - }) - aksCtx := coreoptions.ToContext(ctx, coretest.Options()) - aksCtx = options.ToContext(aksCtx, aksTestOptions) - - aksAzureEnv := test.NewEnvironment(aksCtx, env) - test.ApplyDefaultStatus(nodeClass, env, aksTestOptions.UseSIG) - aksCloudProvider := New(aksAzureEnv.InstanceTypesProvider, aksAzureEnv.VMInstanceProvider, aksAzureEnv.AKSMachineProvider, recorder, env.Client, aksAzureEnv.ImageProvider, aksAzureEnv.InstanceTypeStore) - aksCluster := state.NewCluster(fakeClock, env.Client, aksCloudProvider) - aksProv := provisioning.NewProvisioner(env.Client, 
recorder, aksCloudProvider, aksCluster, fakeClock) - - ExpectApplied(aksCtx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(aksCtx, env.Client, aksCluster, aksCloudProvider, aksProv, aksAzureEnv, pod) - ExpectScheduled(aksCtx, env.Client, pod) - - // Verify AKS machine was created with expected tags - Expect(aksAzureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := aksAzureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := input.AKSMachine - Expect(aksMachine).ToNot(BeNil()) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_test-tag")) - Expect(*aksMachine.Properties.Tags["karpenter.azure.com_test-tag"]).To(Equal("test-value")) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) - Expect(*aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal("test-cluster")) - Expect(aksMachine.Properties.Tags).To(HaveKey("compute.aks.billing")) - Expect(*aksMachine.Properties.Tags["compute.aks.billing"]).To(Equal("linux")) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) - Expect(*aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(nodePool.Name)) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) - - // Clean up - aksCluster.Reset() - aksAzureEnv.Reset(ctx) - }) - }) - - // Mostly ported from VM test: Context "Ephemeral Disk" - // Note: AKS Machine API has simpler disk configuration compared to VM API - // - VMs control detailed StorageProfile, DiffDiskSettings, Placement (NVMe/Cache) - // - AKS machines use OSDiskType (Managed/Ephemeral) and OSDiskSizeGB - // - AKS machines automatically handles placement decisions (NVMe vs Cache disk) - Context("Create - Ephemeral Disk", func() { - // Ported from VM test: "should use ephemeral disk if supported, and has space of at least 128GB by default" - 
It("should use ephemeral disk if supported, and has space of at least 128GB by default", func() { - // Select a SKU that supports ephemeral disks with sufficient space - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D64s_v3"}, // Has large cache disk space, - }) + It("should add additional tags to the node", func() { + originalOptions := options.FromContext(ctx) + updatedOptions := *originalOptions + updatedOptions.AdditionalTags = map[string]string{"karpenter.azure.com/test-tag": "test-value"} + ctx = options.ToContext(ctx, &updatedOptions) ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Verify AKS machine uses ephemeral disk - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine).ToNot(BeNil()) + Expect(aksMachine.Properties.Tags).To(HaveKeyWithValue("karpenter.azure.com_test-tag", lo.ToPtr("test-value"))) + Expect(aksMachine.Properties.Tags).To(HaveKeyWithValue("karpenter.azure.com_cluster", lo.ToPtr("test-cluster"))) + 
Expect(aksMachine.Properties.Tags).To(HaveKeyWithValue("compute.aks.billing", lo.ToPtr("linux"))) + Expect(aksMachine.Properties.Tags).To(HaveKeyWithValue("karpenter.sh_nodepool", lo.ToPtr(nodePool.Name))) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Tags).To(Equal(map[string]*string{ + "karpenter.azure.com_test-tag": lo.ToPtr("test-value"), + "karpenter.azure.com_cluster": lo.ToPtr("test-cluster"), + "compute.aks.billing": lo.ToPtr("linux"), + "karpenter.sh_nodepool": lo.ToPtr(nodePool.Name), + })) + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + Expect(nic.Interface.Tags).To(Equal(map[string]*string{ + "karpenter.azure.com_test-tag": lo.ToPtr("test-value"), + "karpenter.azure.com_cluster": lo.ToPtr("test-cluster"), + "compute.aks.billing": lo.ToPtr("linux"), + "karpenter.sh_nodepool": lo.ToPtr(nodePool.Name), + })) + } + }) + }) + + Context("Ephemeral Disk", func() { + var originalOptions *options.Options + BeforeEach(func() { + originalOptions = options.FromContext(ctx) + updatedOptions := *originalOptions + updatedOptions.UseSIG = true + ctx = options.ToContext(ctx, &updatedOptions) + Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + }) + + AfterEach(func() { + ctx = options.ToContext(ctx, originalOptions) + Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) }) - // Ported from VM test: "should fail to provision if ephemeral disk ask for is too large" + // For Machine API mode, this responsibility is delegated 
to Machine API. + // - VMs control detailed StorageProfile, DiffDiskSettings, Placement (NVMe/Cache) + // - AKS machines use OSDiskType (Managed/Ephemeral) and OSDiskSizeGB + // - AKS machines automatically handles placement decisions (NVMe vs Cache disk) + if !provisionMode.isAKSMachineMode() { + Context("Placement", func() { + It("should prefer NVMe disk if supported for ephemeral", func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D128ds_v6"}, + }) + nodeClass.Spec.OSDiskSizeGB = lo.ToPtr[int32](100) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) + Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Placement)).To(Equal(armcompute.DiffDiskPlacementNvmeDisk)) + }) + + It("should not select NVMe ephemeral disk placement if the sku has an nvme disk, supports ephemeral os disk, but doesnt support NVMe placement", func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC24ads_A100_v4"}, + }) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, 
cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) + Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Placement)).ToNot(Equal(armcompute.DiffDiskPlacementNvmeDisk)) + }) + + It("should prefer cache disk placement when both cache and temp disk support ephemeral and fit the default 128GB threshold", func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D64s_v3"}, + }) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) + Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Placement)).To(Equal(armcompute.DiffDiskPlacementCacheDisk)) + }) + + It("should select managed disk if cache disk is too small but temp disk supports ephemeral and fits osDiskSizeGB to have parity with the AKS Nodepool API", func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_B20ms"}, + }) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, 
env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).To(BeNil()) + }) + }) + } + It("should fail to provision if ephemeral disk ask for is too large", func() { nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, Operator: v1.NodeSelectorOpGt, Values: []string{"100000"}, - }) // No InstanceType will match this requirement + }) ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - }) - - // Ported from VM test: should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits It("should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits", func() { - // Select instances that support ephemeral disks nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, Operator: v1.NodeSelectorOpGt, @@ -374,21 +310,24 @@ func runAKSMachineAPIFeatureTests() { ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Should select a SKU with ephemeral capability - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) - // Should use ephemeral since we required sufficient ephemeral storage - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(30))) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(30))) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) + Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(30))) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) + Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Option)).To(Equal(armcompute.DiffDiskOptionsLocal)) + } }) - - // Ported from VM test: "should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class" It("should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class", func() { - // Configure specific OS disk size in NodeClass - nodeClass.Spec.OSDiskSizeGB = lo.ToPtr(int32(256)) - - // Select an instance type 
that supports the disk size + nodeClass.Spec.OSDiskSizeGB = lo.ToPtr[int32](256) nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, @@ -401,20 +340,24 @@ func runAKSMachineAPIFeatureTests() { ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Verify AKS machine was created with correct OS disk size - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(256))) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(256))) + Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) + 
Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(256))) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) + Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Option)).To(Equal(armcompute.DiffDiskOptionsLocal)) + } }) - - // Ported from VM test: "should not use ephemeral disk if ephemeral is supported, but we don't have enough space" It("should not use ephemeral disk if ephemeral is supported, but we don't have enough space", func() { - // Select Standard_D2s_v3 which supports ephemeral but has limited space - // Standard_D2s_V3 has 53GB Of CacheDisk space and 16GB of Temp Disk Space. - // With our rule of 128GB being the minimum OSDiskSize, this should fall back to managed disk nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, @@ -422,768 +365,2190 @@ func runAKSMachineAPIFeatureTests() { }) ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Should fall back to managed disk due to insufficient ephemeral space - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeManaged)) - Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) - 
Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(128))) // Default size - }) - }) - - Context("Create - Additional Configurations", func() { - It("should handle configured NodeClass", func() { - // Configure comprehensive NodeClass settings - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - CPUManagerPolicy: lo.ToPtr("static"), - CPUCFSQuota: lo.ToPtr(true), - ImageGCHighThresholdPercent: lo.ToPtr(int32(85)), - ImageGCLowThresholdPercent: lo.ToPtr(int32(80)), - FailSwapOn: lo.ToPtr(false), - } - nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.Ubuntu2204ImageFamily) - - // Override context to use a BYO VNet instead of managed VNet - // This allows testing custom subnet configuration (managed VNet doesn't allow custom subnets) - byoClusterSubnetID := "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/byo-vnet-customname/subnets/cluster-subnet" - byoOpts := test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), - UseSIG: lo.ToPtr(true), - SubnetID: lo.ToPtr(byoClusterSubnetID), - }) - byoCtx := options.ToContext(ctx, byoOpts) - - // Extract cluster subnet components and create a test subnet in the same VNet - clusterSubnetComponents, err := utils.GetVnetSubnetIDComponents(byoClusterSubnetID) - Expect(err).ToNot(HaveOccurred()) - testSubnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/nodeclass-subnet", - clusterSubnetComponents.SubscriptionID, clusterSubnetComponents.ResourceGroupName, clusterSubnetComponents.VNetName) - nodeClass.Spec.VNETSubnetID = lo.ToPtr(testSubnetID) - nodeClass.Spec.Tags = map[string]string{ - "custom-tag": "custom-value", - "environment": "test", - "team": "platform", + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine 
:= azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeManaged)) + Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(128))) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) + Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(128))) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).To(BeNil()) } - nodeClass.Spec.OSDiskSizeGB = lo.ToPtr(int32(100)) - - // Configure GPU workload to test GPU node selection - pod := coretest.UnschedulablePod(coretest.PodOptions{ - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, - Limits: v1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, - }, - }) - - ExpectApplied(byoCtx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(byoCtx, env.Client, statusController, nodeClass) - ExpectProvisionedAndWaitForPromises(byoCtx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(byoCtx, env.Client, pod) - - // Verify AKS machine was created - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := input.AKSMachine - - // Verify kubelet configuration - Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) - 
Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUManagerPolicy).To(Equal("static")) - Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUCfsQuota).To(Equal(true)) - Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcHighThreshold).To(Equal(int32(85))) - Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcLowThreshold).To(Equal(int32(80))) - Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) - - // Verify image family configuration - Expect(string(*aksMachine.Properties.OperatingSystem.OSSKU)).To(Equal(v1beta1.Ubuntu2204ImageFamily)) - - // Verify subnet configuration (AKS machine should use the specified custom subnet) - Expect(aksMachine.Properties.Network).ToNot(BeNil()) - Expect(aksMachine.Properties.Network.VnetSubnetID).ToNot(BeNil()) - Expect(*aksMachine.Properties.Network.VnetSubnetID).To(Equal(testSubnetID)) - - // Verify custom tags from NodeClass - Expect(aksMachine.Properties.Tags).To(HaveKey("custom-tag")) - Expect(*aksMachine.Properties.Tags["custom-tag"]).To(Equal("custom-value")) - Expect(aksMachine.Properties.Tags).To(HaveKey("environment")) - Expect(*aksMachine.Properties.Tags["environment"]).To(Equal("test")) - Expect(aksMachine.Properties.Tags).To(HaveKey("team")) - Expect(*aksMachine.Properties.Tags["team"]).To(Equal("platform")) - - // Verify Karpenter-managed tags are still present and correct - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) - Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) - Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) - Expect(aksMachine.Properties.Tags).To(HaveKey("compute.aks.billing")) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) - - // Verify OS disk size configuration - 
Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) - Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(100))) - - // Verify GPU node was selected (machine should be GPU-capable) - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - vmSize := *aksMachine.Properties.Hardware.VMSize - Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) - - // Verify image selection - NodeImageVersion should be set correctly - Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) - Expect(*aksMachine.Properties.NodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) }) - It("should handle configured NodeClaim", func() { - nodeClaim.Spec.Taints = []v1.Taint{ - {Key: "test-taint", Value: "test-value", Effect: v1.TaintEffectNoSchedule}, - } - nodeClaim.Spec.StartupTaints = []v1.Taint{ - {Key: "startup-taint", Value: "startup-value", Effect: v1.TaintEffectNoExecute}, - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) - _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - - // Verify machine was created with correct taints - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - machine := input.AKSMachine - - // Check that taints are configured - // Currently, we will use "nodeInitializationTaints" field for all taints. More details in the relevant code (aksmachineinstancehelpers.go). 
- Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("test-taint=test-value:NoSchedule"))) - Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("startup-taint=startup-value:NoExecute"))) - }) + It("should use ephemeral disk if supported, and has space of at least 128GB by default", func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D64s_v3"}, + }) - It("should not allow the user to override Karpenter-managed tags", func() { - nodeClass.Spec.Tags = map[string]string{ - "karpenter.azure.com/cluster": "my-override-cluster", - "karpenter.sh/nodepool": "my-override-nodepool", - } ExpectApplied(ctx, env.Client, nodePool, nodeClass) ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Verify AKS machine was created with correct Karpenter-managed tags (not user overrides) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := input.AKSMachine - - // Check that AKS machine has correct Karpenter-managed tags - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) - Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) - Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) - Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) - - // Verify user-specified tags are ignored for Karpenter-managed keys - 
Expect(*aksMachine.Properties.Tags["karpenter.sh_nodepool"]).ToNot(Equal("my-override-nodepool")) - Expect(*aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).ToNot(Equal("my-override-cluster")) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskType).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskType).To(Equal(armcontainerservice.OSDiskTypeEphemeral)) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) + Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(128))) + Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) + Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Option)).To(Equal(armcompute.DiffDiskOptionsLocal)) + } }) }) - // Ported from VM test: "EncryptionAtHost" - Context("Create - EncryptionAtHost", func() { - It("should create AKS machine with EncryptionAtHost enabled when specified in AKSNodeClass", func() { - if nodeClass.Spec.Security == nil { - nodeClass.Spec.Security = &v1beta1.Security{} - } - nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(true) + Context("ImageReference", func() { + It("should use shared image gallery images when options are set to UseSIG", func() { + imageOptions := *options.FromContext(ctx) + imageOptions.UseSIG = true + ctx = options.ToContext(ctx, &imageOptions) + azureEnv = test.NewEnvironment(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, 
azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, imageOptions.ParsedDiskEncryptionSetID, imageOptions.NetworkPolicy, imageOptions.NetworkPlugin) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.Security).ToNot(BeNil()) - Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeTrue()) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring("AKSUbuntu")) + Expect(nodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := 
azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + Expect(vm.Properties.StorageProfile.ImageReference.ID).ShouldNot(BeNil()) + Expect(vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID).Should(BeNil()) + Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring(imageOptions.SIGSubscriptionID)) + Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring("AKSUbuntu")) + } }) - It("should create AKS machine with EncryptionAtHost disabled when specified in AKSNodeClass", func() { - if nodeClass.Spec.Security == nil { - nodeClass.Spec.Security = &v1beta1.Security{} - } - nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(false) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + // For Machine API mode, CIG is not supported (and not possible). + if !provisionMode.isAKSMachineMode() { + It("should use Community Images when options are set to UseSIG=false", func() { + options := test.Options(test.OptionsFields{ + UseSIG: lo.ToPtr(false), + }) + ctx = options.ToContext(ctx) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID).Should(Not(BeNil())) 
+ }) + } + }) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Context("ImageProvider + Image Family", func() { + DescribeTable("should select the right Shared Image Gallery image for a given instance type", + func(instanceType string, imageFamily string, expectedImageDefinition string, expectedGalleryRG string, expectedGalleryURL string) { + imageOptions := *options.FromContext(ctx) + imageOptions.UseSIG = true + ctx = options.ToContext(ctx, &imageOptions) + azureEnv = test.NewEnvironment(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, imageOptions.ParsedDiskEncryptionSetID, imageOptions.NetworkPolicy, imageOptions.NetworkPlugin) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - Expect(aksMachine.Properties.Security).ToNot(BeNil()) - Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) - }) + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod 
:= coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + expectedPrefix := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/galleries/%s/images/%s", imageOptions.SIGSubscriptionID, expectedGalleryRG, expectedGalleryURL, expectedImageDefinition) + Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring(expectedPrefix)) + } + }, + Entry("Gen2, Gen1 instance type with AKSUbuntu image family", "Standard_D2_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ImageDefinition, imagefamily.AKSUbuntuResourceGroup, imagefamily.AKSUbuntuGalleryName), + Entry("Gen1 instance type with AKSUbuntu image family", "Standard_D2_v3", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen1ImageDefinition, imagefamily.AKSUbuntuResourceGroup, imagefamily.AKSUbuntuGalleryName), + Entry("ARM instance type with AKSUbuntu image family", "Standard_D16plds_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ArmImageDefinition, imagefamily.AKSUbuntuResourceGroup, imagefamily.AKSUbuntuGalleryName), + ) + It("should select the right Shared Image 
Gallery image for a given instance type, Gen2 instance type with AzureLinux image family", func() { + instanceType := "Standard_D2_v5" + imageFamily := v1beta1.AzureLinuxImageFamily + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ImageDefinition, imagefamily.AzureLinuxGen2ImageDefinition) + imageOptions := *options.FromContext(ctx) + imageOptions.UseSIG = true + ctx = options.ToContext(ctx, &imageOptions) + azureEnv = test.NewEnvironment(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, imageOptions.ParsedDiskEncryptionSetID, imageOptions.NetworkPolicy, imageOptions.NetworkPlugin) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) - It("should create AKS machine with EncryptionAtHost disabled when not specified in AKSNodeClass", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) 
ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - // Security profile should still exist but EncryptionAtHost should be false (default) - Expect(aksMachine.Properties.Security).ToNot(BeNil()) - Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + expectedPrefix := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/galleries/%s/images/%s", imageOptions.SIGSubscriptionID, imagefamily.AKSAzureLinuxResourceGroup, imagefamily.AKSAzureLinuxGalleryName, expectedImageDefinition) + Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring(expectedPrefix)) + } }) - }) + It("should select the right Shared Image Gallery image for a given instance type, Gen1 instance type with AzureLinux image family", func() { + instanceType := "Standard_D2_v3" + imageFamily := v1beta1.AzureLinuxImageFamily + kubernetesVersion := 
lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen1ImageDefinition, imagefamily.AzureLinuxGen1ImageDefinition) + imageOptions := *options.FromContext(ctx) + imageOptions.UseSIG = true + ctx = options.ToContext(ctx, &imageOptions) + azureEnv = test.NewEnvironment(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, imageOptions.ParsedDiskEncryptionSetID, imageOptions.NetworkPolicy, imageOptions.NetworkPlugin) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - // Labels in the kubernetes.io/k8s.io domains were previously restricted by Karpenter core (<1.9.x) - // and are now allowed on NodeClaims. However, kubelet cannot set most of them, so they should be - // filtered out of AKS Machine NodeLabels (same as the VM path). Karpenter syncs them to the Node - // directly, so they still appear on the Node object. 
- DescribeTable("should handle previously reserved labels on AKS Machine create", - func(label string, expectedInNodeLabels bool) { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, - karpv1.NodeSelectorRequirementWithMinValues{Key: label, Operator: v1.NodeSelectorOpIn, Values: []string{"custom-value"}}, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) - pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{label: "custom-value"}}) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) + ExpectScheduled(ctx, env.Client, pod) - // Label should always be on the Node (synced by Karpenter) - Expect(node.Labels).To(HaveKeyWithValue(label, "custom-value")) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + 
Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + expectedPrefix := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/galleries/%s/images/%s", imageOptions.SIGSubscriptionID, imagefamily.AKSAzureLinuxResourceGroup, imagefamily.AKSAzureLinuxGalleryName, expectedImageDefinition) + Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring(expectedPrefix)) + } + }) + It("should select the right Shared Image Gallery image for a given instance type, ARM instance type with AzureLinux image family", func() { + instanceType := "Standard_D16plds_v5" + imageFamily := v1beta1.AzureLinuxImageFamily + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + expectedImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ArmImageDefinition, imagefamily.AzureLinuxGen2ArmImageDefinition) + imageOptions := *options.FromContext(ctx) + imageOptions.UseSIG = true + ctx = options.ToContext(ctx, &imageOptions) + azureEnv = test.NewEnvironment(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, imageOptions.ParsedDiskEncryptionSetID, imageOptions.NetworkPolicy, imageOptions.NetworkPlugin) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: 
v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - if expectedInNodeLabels { - Expect(aksMachine.Properties.Kubernetes.NodeLabels).To(HaveKeyWithValue(label, lo.ToPtr("custom-value"))) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + nodeImageVersion := lo.FromPtr(aksMachine.Properties.NodeImageVersion) + Expect(nodeImageVersion).To(ContainSubstring(expectedImageDefinition)) } else { - Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(label)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + expectedPrefix := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/galleries/%s/images/%s", imageOptions.SIGSubscriptionID, imagefamily.AKSAzureLinuxResourceGroup, imagefamily.AKSAzureLinuxGalleryName, expectedImageDefinition) + 
Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring(expectedPrefix)) } - }, - Entry("kubernetes.io (previously reserved)", "kubernetes.io/custom-label", false), - Entry("k8s.io (previously reserved)", "k8s.io/custom-label", false), - Entry("kubelet.kubernetes.io (kubelet-allowed)", "kubelet.kubernetes.io/custom-label", true), - ) + }) - Context("Create - LinuxOSConfig", func() { - It("should create AKS machine with full LinuxOSConfig when specified in AKSNodeClass", func() { - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - FailSwapOn: lo.ToPtr(false), + // For Machine API mode, CIG is not supported (and not possible). + if !provisionMode.isAKSMachineMode() { + imageDefinition := func(imageDefinition string) func() string { + return func() string { return imageDefinition } + } + azureLinuxGen2ImageDefinition := func() string { + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + return lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ImageDefinition, imagefamily.AzureLinuxGen2ImageDefinition) } - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - SwapFileSize: lo.ToPtr("1500Mi"), - TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragMadvise), - TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledAlways), - Sysctls: &v1beta1.SysctlConfiguration{ - FsAioMaxNr: lo.ToPtr(int32(65536)), - FsFileMax: lo.ToPtr(int32(12000)), - FsInotifyMaxUserWatches: lo.ToPtr(int32(781250)), - FsNrOpen: lo.ToPtr(int32(8192)), - KernelThreadsMax: lo.ToPtr(int32(30000)), - NetCoreNetdevMaxBacklog: lo.ToPtr(int32(1000)), - NetCoreOptmemMax: lo.ToPtr(int32(20480)), - NetCoreRmemDefault: lo.ToPtr(int32(212992)), - NetCoreRmemMax: lo.ToPtr(int32(212992)), - NetCoreSomaxconn: lo.ToPtr(int32(4096)), - NetCoreWmemDefault: lo.ToPtr(int32(212992)), - NetCoreWmemMax: lo.ToPtr(int32(212992)), - 
NetIPv4IPLocalPortRange: lo.ToPtr("32768 60999"), - NetIPv4NeighDefaultGcThresh1: lo.ToPtr(int32(128)), - NetIPv4NeighDefaultGcThresh2: lo.ToPtr(int32(512)), - NetIPv4NeighDefaultGcThresh3: lo.ToPtr(int32(1024)), - NetIPv4TCPFinTimeout: lo.ToPtr(int32(60)), - NetIPv4TCPKeepaliveProbes: lo.ToPtr(int32(9)), - NetIPv4TCPKeepaliveTime: lo.ToPtr(int32(7200)), - NetIPv4TCPMaxSynBacklog: lo.ToPtr(int32(128)), - NetIPv4TCPMaxTwBuckets: lo.ToPtr(int32(8000)), - NetIPv4TCPTwReuse: lo.ToPtr(true), - NetIPv4TCPKeepaliveIntvl: lo.ToPtr(int32(75)), - NetNetfilterNfConntrackBuckets: lo.ToPtr(int32(65536)), - NetNetfilterNfConntrackMax: lo.ToPtr(int32(131072)), - VMMaxMapCount: lo.ToPtr(int32(65530)), - VMSwappiness: lo.ToPtr(int32(60)), - VMVfsCachePressure: lo.ToPtr(int32(100)), + azureLinuxGen1ImageDefinition := func() string { + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + return lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen1ImageDefinition, imagefamily.AzureLinuxGen1ImageDefinition) + } + azureLinuxGen2ArmImageDefinition := func() string { + kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() + expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) + return lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ArmImageDefinition, imagefamily.AzureLinuxGen2ArmImageDefinition) + } + + DescribeTable("should select the right Community Gallery image for a given instance type", + func(instanceType string, imageFamily string, expectedImageDefinition func() string, expectedGalleryURL string) { + imageOptions := test.Options(test.OptionsFields{ + UseSIG: lo.ToPtr(false), + }) + ctx = imageOptions.ToContext(ctx) + imageStatusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, 
azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, imageOptions.ParsedDiskEncryptionSetID, imageOptions.NetworkPolicy, imageOptions.NetworkPlugin) + + nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{instanceType}}) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, imageStatusController, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + Expect(vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID).ToNot(BeNil()) + parts := strings.Split(*vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID, "/") + Expect(parts[2]).To(Equal(expectedGalleryURL)) + Expect(parts[4]).To(Equal(expectedImageDefinition())) }, + Entry("Gen2, Gen1 instance type with AKSUbuntu image family", "Standard_D2_v5", v1beta1.Ubuntu2204ImageFamily, imageDefinition(imagefamily.Ubuntu2204Gen2ImageDefinition), imagefamily.AKSUbuntuPublicGalleryURL), + Entry("Gen1 instance type with AKSUbuntu image family", "Standard_D2_v3", v1beta1.Ubuntu2204ImageFamily, imageDefinition(imagefamily.Ubuntu2204Gen1ImageDefinition), imagefamily.AKSUbuntuPublicGalleryURL), + Entry("ARM instance type with AKSUbuntu image family", "Standard_D16plds_v5", v1beta1.Ubuntu2204ImageFamily, imageDefinition(imagefamily.Ubuntu2204Gen2ArmImageDefinition), imagefamily.AKSUbuntuPublicGalleryURL), + Entry("Gen2 instance 
type with AzureLinux image family", "Standard_D2_v5", v1beta1.AzureLinuxImageFamily, azureLinuxGen2ImageDefinition, imagefamily.AKSAzureLinuxPublicGalleryURL), + Entry("Gen1 instance type with AzureLinux image family", "Standard_D2_v3", v1beta1.AzureLinuxImageFamily, azureLinuxGen1ImageDefinition, imagefamily.AKSAzureLinuxPublicGalleryURL), + Entry("ARM instance type with AzureLinux image family", "Standard_D16plds_v5", v1beta1.AzureLinuxImageFamily, azureLinuxGen2ArmImageDefinition, imagefamily.AKSAzureLinuxPublicGalleryURL), + ) + } + }) + + Context("Nodepool with KubeletConfig", func() { + It("should support provisioning with kubeletConfig, computeResources and maxPods not specified", func() { + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + CPUManagerPolicy: lo.ToPtr("static"), + CPUCFSQuota: lo.ToPtr(true), + CPUCFSQuotaPeriod: metav1.Duration{}, + ImageGCHighThresholdPercent: lo.ToPtr(int32(30)), + ImageGCLowThresholdPercent: lo.ToPtr(int32(20)), + TopologyManagerPolicy: lo.ToPtr("best-effort"), + AllowedUnsafeSysctls: []string{"Allowed", "Unsafe", "Sysctls"}, + ContainerLogMaxSize: lo.ToPtr("42Mi"), + ContainerLogMaxFiles: lo.ToPtr[int32](13), + PodPidsLimit: lo.ToPtr[int64](99), } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + if !provisionMode.isAKSMachineMode() { + customData := ExpectDecodedCustomData(azureEnv) + + expectedFlags := map[string]string{ + "eviction-hard": "memory.available<750Mi", + "image-gc-high-threshold": "30", + "image-gc-low-threshold": "20", + "cpu-cfs-quota": "true", + "max-pods": "250", + "topology-manager-policy": "best-effort", + "container-log-max-size": "42Mi", + "allowed-unsafe-sysctls": "Allowed,Unsafe,Sysctls", + "cpu-manager-policy": "static", + 
"container-log-max-files": "13", + "pod-max-pids": "99", + } + + ExpectKubeletFlags(azureEnv, customData, expectedFlags) + Expect(customData).To(SatisfyAny( + ContainSubstring("--system-reserved=cpu=0,memory=0"), + ContainSubstring("--system-reserved=memory=0,cpu=0"), + )) + Expect(customData).To(SatisfyAny( + ContainSubstring("--kube-reserved=cpu=100m,memory=1843Mi"), + ContainSubstring("--kube-reserved=memory=1843Mi,cpu=100m"), + )) + } + // For Machine API mode, this responsibility is delegated to Machine API. + }) + }) + + Context("Create - Labels and Taints", func() { + type wellKnownLabelEntry struct { + name string + label string + valueFunc func() string + setupFunc func() + expectedInKubeletLabels bool + expectedOnNode bool + } + + requireFunc := func(key, value string) func() { + return func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, + karpv1.NodeSelectorRequirementWithMinValues{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}, + ) + } + } + + entries := []wellKnownLabelEntry{ + {name: v1.LabelTopologyRegion, label: v1.LabelTopologyRegion, valueFunc: func() string { return fake.Region }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: karpv1.NodePoolLabelKey, label: karpv1.NodePoolLabelKey, valueFunc: func() string { return nodePool.Name }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1.LabelTopologyZone, label: v1.LabelTopologyZone, valueFunc: func() string { return fakeZone1 }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1.LabelInstanceTypeStable, label: v1.LabelInstanceTypeStable, valueFunc: func() string { return "Standard_NC24ads_A100_v4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1.LabelOSStable, label: v1.LabelOSStable, valueFunc: func() string { return "linux" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1.LabelArchStable, label: v1.LabelArchStable, valueFunc: func() string { 
return "amd64" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: karpv1.CapacityTypeLabelKey, label: karpv1.CapacityTypeLabelKey, valueFunc: func() string { return "on-demand" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelPlacementScope, label: v1beta1.LabelPlacementScope, valueFunc: func() string { return v1beta1.PlacementScopeZonal }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUName, label: v1beta1.LabelSKUName, valueFunc: func() string { return "Standard_NC24ads_A100_v4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUFamily, label: v1beta1.LabelSKUFamily, valueFunc: func() string { return "N" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUSeries, label: v1beta1.LabelSKUSeries, valueFunc: func() string { return "NCads_v4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUVersion, label: v1beta1.LabelSKUVersion, valueFunc: func() string { return "4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUStorageEphemeralOSMaxSize, label: v1beta1.LabelSKUStorageEphemeralOSMaxSize, valueFunc: func() string { return "429" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUAcceleratedNetworking, label: v1beta1.LabelSKUAcceleratedNetworking, valueFunc: func() string { return "true" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUStoragePremiumCapable, label: v1beta1.LabelSKUStoragePremiumCapable, valueFunc: func() string { return "true" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUGPUName, label: v1beta1.LabelSKUGPUName, valueFunc: func() string { return "A100" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUGPUManufacturer, label: v1beta1.LabelSKUGPUManufacturer, valueFunc: func() string { return "nvidia" }, 
expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUGPUCount, label: v1beta1.LabelSKUGPUCount, valueFunc: func() string { return "1" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUCPU, label: v1beta1.LabelSKUCPU, valueFunc: func() string { return "24" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUMemory, label: v1beta1.LabelSKUMemory, valueFunc: func() string { return "8192" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelCPU, label: v1beta1.AKSLabelCPU, valueFunc: func() string { return "24" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelMemory, label: v1beta1.AKSLabelMemory, valueFunc: func() string { return "8192" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelMode + "=user", label: v1beta1.AKSLabelMode, valueFunc: func() string { return "user" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelMode + "=system", label: v1beta1.AKSLabelMode, valueFunc: func() string { return "system" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelScaleSetPriority + "=regular", label: v1beta1.AKSLabelScaleSetPriority, valueFunc: func() string { return "regular" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelScaleSetPriority + "=spot", label: v1beta1.AKSLabelScaleSetPriority, valueFunc: func() string { return "spot" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelPriority + "=regular", label: v1beta1.AKSLabelPriority, valueFunc: func() string { return "regular" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelPriority + "=spot", label: v1beta1.AKSLabelPriority, valueFunc: func() string { return "spot" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelOSSKU, label: v1beta1.AKSLabelOSSKU, 
valueFunc: func() string { return "Ubuntu" }, expectedInKubeletLabels: true, expectedOnNode: true}, + { + name: v1beta1.AKSLabelFIPSEnabled, + label: v1beta1.AKSLabelFIPSEnabled, + setupFunc: func() { + testOptions.UseSIG = true + ctx = options.ToContext(ctx, testOptions) + nodeClass.Spec.FIPSMode = &v1beta1.FIPSModeFIPS + nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.AzureLinuxImageFamily) + azureEnv = test.NewEnvironment(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + }, + valueFunc: func() string { return "true" }, + expectedInKubeletLabels: true, + expectedOnNode: true, + }, + {name: v1.LabelFailureDomainBetaRegion, label: v1.LabelFailureDomainBetaRegion, valueFunc: func() string { return fake.Region }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: v1.LabelFailureDomainBetaZone, label: v1.LabelFailureDomainBetaZone, valueFunc: func() string { return fakeZone1 }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: "beta.kubernetes.io/arch", label: "beta.kubernetes.io/arch", valueFunc: func() string { return "amd64" }, expectedInKubeletLabels: false, 
expectedOnNode: false}, + {name: "beta.kubernetes.io/os", label: "beta.kubernetes.io/os", valueFunc: func() string { return "linux" }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: v1.LabelInstanceType, label: v1.LabelInstanceType, valueFunc: func() string { return "Standard_NC24ads_A100_v4" }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: "topology.disk.csi.azure.com/zone", label: "topology.disk.csi.azure.com/zone", valueFunc: func() string { return fakeZone1 }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: v1.LabelWindowsBuild, label: v1.LabelWindowsBuild, valueFunc: func() string { return "window" }, expectedInKubeletLabels: true, expectedOnNode: false}, + {name: v1beta1.AKSLabelCluster, label: v1beta1.AKSLabelCluster, valueFunc: func() string { return "test-resourceGroup" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: "kubernetes.io (previously reserved)", label: "kubernetes.io/custom-label", setupFunc: requireFunc("kubernetes.io/custom-label", "custom-value"), valueFunc: func() string { return "custom-value" }, expectedInKubeletLabels: false, expectedOnNode: true}, + {name: "k8s.io (previously reserved)", label: "k8s.io/custom-label", setupFunc: requireFunc("k8s.io/custom-label", "custom-value"), valueFunc: func() string { return "custom-value" }, expectedInKubeletLabels: false, expectedOnNode: true}, + {name: "kubelet.kubernetes.io (kubelet-allowed)", label: "kubelet.kubernetes.io/custom-label", setupFunc: requireFunc("kubelet.kubernetes.io/custom-label", "custom-value"), valueFunc: func() string { return "custom-value" }, expectedInKubeletLabels: true, expectedOnNode: true}, + } + + nonSchedulableLabels := map[string]string{ + labels.AKSLabelRole: "agent", + v1beta1.AKSLabelKubeletIdentityClientID: test.Options().KubeletIdentityClientID, + "kubernetes.azure.com/mode": "user", + labels.AKSLabelSubnetName: "aks-subnet", + labels.AKSLabelVNetGUID: test.Options().VnetGUID, + 
labels.AKSLabelAzureCNIOverlay: strconv.FormatBool(true), + labels.AKSLabelPodNetworkType: consts.NetworkPluginModeOverlay, + karpv1.NodeDoNotSyncTaintsLabelKey: "true", + } + + It("entries should cover every WellKnownLabel", func() { + expectedLabels := append(karpv1.WellKnownLabels.UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...) + Expect(lo.Map(entries, func(item wellKnownLabelEntry, _ int) string { return item.label })).To(ContainElements(expectedLabels)) + }) + + It("should include karpenter.sh/unregistered taint", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr(karpv1.UnregisteredNoExecuteTaint.ToString()))) + } else { + customData := ExpectDecodedCustomData(azureEnv) + kubeletFlags := customData[strings.Index(customData, "KUBELET_FLAGS=")+len("KUBELET_FLAGS=") : strings.Index(customData, "KUBELET_NODE_LABELS")] + Expect(kubeletFlags).To(ContainSubstring("--register-with-taints=" + karpv1.UnregisteredNoExecuteTaint.ToString())) + } + }) + + It("should support individual instance type labels when all pods schedule at once", func() { + allAtOnceEntries := lo.Filter(entries, func(item wellKnownLabelEntry, _ int) bool { + return item.setupFunc == nil + }) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + var podDetails []struct { + pod *v1.Pod + entry wellKnownLabelEntry + } + for _, item := range allAtOnceEntries { + podDetails = append(podDetails, struct 
{ + pod *v1.Pod + entry wellKnownLabelEntry + }{ + pod: coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{item.label: item.valueFunc()}}), + entry: item, + }) + } + pods := lo.Map(podDetails, func(detail struct { + pod *v1.Pod + entry wellKnownLabelEntry + }, _ int) *v1.Pod { + return detail.pod + }) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) + + vmInputs := map[string]*fake.VirtualMachineCreateOrUpdateInput{} + if !provisionMode.isAKSMachineMode() { + for vmInput := range azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.All() { + vmInputs[*vmInput.VM.Name] = vmInput + } + } + + for _, detail := range podDetails { + key := lo.Keys(detail.pod.Spec.NodeSelector)[0] + node := ExpectScheduled(ctx, env.Client, detail.pod) + if detail.entry.expectedOnNode { + Expect(node.Labels[key]).To(Equal(detail.pod.Spec.NodeSelector[key])) + } else { + Expect(node.Labels).ToNot(HaveKey(key)) + } + + if provisionMode.isAKSMachineMode() { + vmName, err := nodeclaimutils.GetVMName(node.Spec.ProviderID) + Expect(err).ToNot(HaveOccurred()) + aksMachineName, err := instance.GetAKSMachineNameFromVMName(testOptions.AKSMachinesPoolName, vmName) + Expect(err).ToNot(HaveOccurred()) + machineID := fake.MkMachineID(testOptions.NodeResourceGroup, testOptions.ClusterName, testOptions.AKSMachinesPoolName, aksMachineName) + aksMachine, ok := azureEnv.AKSDataStorage.AKSMachines.Load(machineID) + Expect(ok).To(BeTrue()) + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + if detail.entry.label == v1beta1.AKSLabelFIPSEnabled { + // Machine API takes responsibility for populating the FIPS label on the Node via kubelet. 
+ continue + } + if v1beta1.IsAKSLabel(detail.entry.label) || labels.IsLabelKubeletManaged(detail.entry.label) || !labels.CanKubeletSetLabel(detail.entry.label) { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(detail.entry.label)) + continue + } + if detail.entry.expectedInKubeletLabels { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).To(HaveKeyWithValue(detail.entry.label, lo.ToPtr(detail.entry.valueFunc()))) + } else { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(detail.entry.label)) + } + } else { + vmName, err := nodeclaimutils.GetVMName(node.Spec.ProviderID) + Expect(err).ToNot(HaveOccurred()) + vm := vmInputs[vmName].VM + Expect(vm.Properties).ToNot(BeNil()) + Expect(vm.Properties.OSProfile).ToNot(BeNil()) + Expect(vm.Properties.OSProfile.CustomData).ToNot(BeNil()) + + decodedBytes, err := base64.StdEncoding.DecodeString(*vm.Properties.OSProfile.CustomData) + Expect(err).To(Succeed()) + decodedString := string(decodedBytes[:]) + startIdx := strings.Index(decodedString, "KUBELET_NODE_LABELS=") + len("KUBELET_NODE_LABELS=") + endIdx := strings.Index(decodedString[startIdx:], "\n") + kubeletNodeLabels := decodedString[startIdx:] + if endIdx != -1 { + kubeletNodeLabels = decodedString[startIdx : startIdx+endIdx] + } + expectedLabel := fmt.Sprintf("%s=%s", detail.entry.label, detail.entry.valueFunc()) + if detail.entry.expectedInKubeletLabels { + Expect(kubeletNodeLabels).To(ContainSubstring(expectedLabel)) + } else { + Expect(kubeletNodeLabels).ToNot(ContainSubstring(expectedLabel)) + } + } + } + }) + + DescribeTable( + "should support individual instance type labels (when all pods scheduled individually)", + func(item wellKnownLabelEntry) { + if item.setupFunc != nil { + item.setupFunc() + } + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + value := item.valueFunc() + pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{item.label: value}}) + + 
ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + + if item.expectedOnNode { + Expect(node.Labels[item.label]).To(Equal(value)) + } else { + Expect(node.Labels).ToNot(HaveKey(item.label)) + } + + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + if item.label == v1beta1.AKSLabelFIPSEnabled { + // Machine API takes responsibility for populating the FIPS label on the Node via kubelet. + return + } + if v1beta1.IsAKSLabel(item.label) || labels.IsLabelKubeletManaged(item.label) || !labels.CanKubeletSetLabel(item.label) { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(item.label)) + return + } + if item.expectedInKubeletLabels { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).To(HaveKeyWithValue(item.label, lo.ToPtr(value))) + } else { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(item.label)) + } + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties).ToNot(BeNil()) + Expect(vm.Properties.OSProfile).ToNot(BeNil()) + Expect(vm.Properties.OSProfile.CustomData).ToNot(BeNil()) + + decodedBytes, err := base64.StdEncoding.DecodeString(*vm.Properties.OSProfile.CustomData) + Expect(err).To(Succeed()) + decodedString := string(decodedBytes[:]) + startIdx := strings.Index(decodedString, "KUBELET_NODE_LABELS=") + len("KUBELET_NODE_LABELS=") + endIdx := strings.Index(decodedString[startIdx:], "\n") + kubeletNodeLabels := decodedString[startIdx:] + if endIdx != -1 { + kubeletNodeLabels = 
decodedString[startIdx : startIdx+endIdx] + } + expectedLabel := fmt.Sprintf("%s=%s", item.label, value) + if item.expectedInKubeletLabels { + Expect(kubeletNodeLabels).To(ContainSubstring(expectedLabel)) + } else { + Expect(kubeletNodeLabels).ToNot(ContainSubstring(expectedLabel)) + } + } + }, + lo.Map(entries, func(item wellKnownLabelEntry, _ int) TableEntry { + return Entry(item.name, item) + }), + ) + It("should write other (non-schedulable) labels to kubelet", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) - - // Verify top-level fields - Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(1500))) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("madvise")) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("always")) - - // Verify failSwapOn was wired through to kubelet config - Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) - - // Verify sysctl fields - Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsAioMaxNr)).To(Equal(int32(65536))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsFileMax)).To(Equal(int32(12000))) - 
Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsInotifyMaxUserWatches)).To(Equal(int32(781250))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsNrOpen)).To(Equal(int32(8192))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.KernelThreadsMax)).To(Equal(int32(30000))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreNetdevMaxBacklog)).To(Equal(int32(1000))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreOptmemMax)).To(Equal(int32(20480))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemDefault)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemMax)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreSomaxconn)).To(Equal(int32(4096))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemDefault)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemMax)).To(Equal(int32(212992))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4IPLocalPortRange)).To(Equal("32768 60999")) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh1)).To(Equal(int32(128))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh2)).To(Equal(int32(512))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh3)).To(Equal(int32(1024))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPFinTimeout)).To(Equal(int32(60))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveProbes)).To(Equal(int32(9))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveTime)).To(Equal(int32(7200))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxSynBacklog)).To(Equal(int32(128))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxTwBuckets)).To(Equal(int32(8000))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPTwReuse)).To(BeTrue()) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TcpkeepaliveIntvl)).To(Equal(int32(75))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackBuckets)).To(Equal(int32(65536))) - 
Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackMax)).To(Equal(int32(131072))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(65530))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(60))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMVfsCachePressure)).To(Equal(int32(100))) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + // Machine API owns these AKS/kubelet-managed labels and carries node mode as a first-class field, not as custom NodeLabels. + for key := range nonSchedulableLabels { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(key)) + } + Expect(aksMachine.Properties.Mode).ToNot(BeNil()) + Expect(*aksMachine.Properties.Mode).To(Equal(armcontainerservice.AgentPoolModeUser)) + } else { + customData := ExpectDecodedCustomData(azureEnv) + startIdx := strings.Index(customData, "KUBELET_NODE_LABELS=") + len("KUBELET_NODE_LABELS=") + endIdx := strings.Index(customData[startIdx:], "\n") + kubeletNodeLabels := customData[startIdx:] + if endIdx != -1 { + kubeletNodeLabels = customData[startIdx : startIdx+endIdx] + } + for key, value := range nonSchedulableLabels { + Expect(kubeletNodeLabels).To(ContainSubstring(fmt.Sprintf("%s=%s", key, value))) + } + } + }) + + DescribeTable("should not write restricted labels to kubelet, but should write allowed labels", func(domain string, allowed bool) { + nodePool.Spec.Template.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ + {Key: domain + "/team", Operator: v1.NodeSelectorOpExists}, + {Key: domain + "/custom-label", Operator: v1.NodeSelectorOpExists}, + {Key: "subdomain." 
+ domain + "/custom-label", Operator: v1.NodeSelectorOpExists}, + } + + nodeSelector := map[string]string{ + domain + "/team": "team-1", + domain + "/custom-label": "custom-value", + "subdomain." + domain + "/custom-label": "custom-value", + } + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: nodeSelector}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + + for key, value := range nodeSelector { + Expect(node.Labels).To(HaveKeyWithValue(key, value)) + } + + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + for key, value := range nodeSelector { + if allowed { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).To(HaveKeyWithValue(key, lo.ToPtr(value))) + } else { + Expect(aksMachine.Properties.Kubernetes.NodeLabels).ToNot(HaveKey(key)) + } + } + } else { + customData := ExpectDecodedCustomData(azureEnv) + startIdx := strings.Index(customData, "KUBELET_NODE_LABELS=") + len("KUBELET_NODE_LABELS=") + endIdx := strings.Index(customData[startIdx:], "\n") + kubeletNodeLabels := customData[startIdx:] + if endIdx != -1 { + kubeletNodeLabels = customData[startIdx : startIdx+endIdx] + } + for key, value := range nodeSelector { + expectedLabel := fmt.Sprintf("%s=%s", key, value) + if allowed { + Expect(kubeletNodeLabels).To(ContainSubstring(expectedLabel)) + } else { + Expect(kubeletNodeLabels).ToNot(ContainSubstring(expectedLabel)) + } + } + } + }, + Entry("node-restriction.kubernetes.io", "node-restriction.kubernetes.io", false), + Entry("node.kubernetes.io", "node.kubernetes.io", true), + ) + }) + + // For Machine API mode, these 
responsibilities are delegated to Machine API. + if !provisionMode.isAKSMachineMode() { + Context("Custom DNS", func() { + It("should support provisioning with custom DNS server from options", func() { + ctx = options.ToContext( + ctx, + test.Options(test.OptionsFields{ + ClusterDNSServiceIP: lo.ToPtr("10.244.0.1"), + }), + ) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + customData := ExpectDecodedCustomData(azureEnv) + + expectedFlags := map[string]string{ + "cluster-dns": "10.244.0.1", + } + + ExpectKubeletFlags(azureEnv, customData, expectedFlags) + }) + }) + + Context("Create - Subnet", func() { + It("should use the VNET_SUBNET_ID", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + Expect(lo.FromPtr(nic.Interface.Properties.IPConfigurations[0].Properties.Subnet.ID)).To(Equal("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/aks-vnet-12345678/subnets/aks-subnet")) + }) + + It("should use the subnet specified in the nodeclass", func() { + clusterSubnetID := "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/byo-vnet/subnets/cluster-subnet" + nodeClassSubnetID := "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/byo-vnet/subnets/nodeclass-subnet" + subnetOptions := *options.FromContext(ctx) + 
subnetOptions.SubnetID = clusterSubnetID + ctx = options.ToContext(ctx, &subnetOptions) + nodeClass.Spec.VNETSubnetID = lo.ToPtr(nodeClassSubnetID) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + Expect(lo.FromPtr(nic.Interface.Properties.IPConfigurations[0].Properties.Subnet.ID)).To(Equal(nodeClassSubnetID)) + }) + + DescribeTable("Azure CNI node labels and agentbaker network plugin", func( + networkPlugin, networkPluginMode, networkDataplane, expectedAgentBakerNetPlugin string, + expectedNodeLabels sets.Set[string]) { + options := test.Options(test.OptionsFields{ + NetworkPlugin: lo.ToPtr(networkPlugin), + NetworkPluginMode: lo.ToPtr(networkPluginMode), + NetworkDataplane: lo.ToPtr(networkDataplane), + }) + ctx = options.ToContext(ctx) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + customData := ExpectDecodedCustomData(azureEnv) + + Expect(customData).To(ContainSubstring(fmt.Sprintf("NETWORK_PLUGIN=%s", expectedAgentBakerNetPlugin))) + + for label := range expectedNodeLabels { + Expect(customData).To(ContainSubstring(label)) + } + }, + Entry("Azure CNI V1", + "azure", "", "", + "azure", sets.New[string]()), + Entry("Azure CNI w Overlay", + "azure", "overlay", "", + "none", + sets.New( + "kubernetes.azure.com/azure-cni-overlay=true", + "kubernetes.azure.com/network-subnet=aks-subnet", + "kubernetes.azure.com/nodenetwork-vnetguid=a519e60a-cac0-40b2-b883-084477fe6f5c", + 
"kubernetes.azure.com/podnetwork-type=overlay", + )), + Entry("Network Plugin none", + "none", "", "", "none", + sets.New[string]()), + Entry("Azure CNI w Overlay w Cilium", + "azure", "overlay", "cilium", + "none", + sets.New( + "kubernetes.azure.com/azure-cni-overlay=true", + "kubernetes.azure.com/network-subnet=aks-subnet", + "kubernetes.azure.com/nodenetwork-vnetguid=a519e60a-cac0-40b2-b883-084477fe6f5c", + "kubernetes.azure.com/podnetwork-type=overlay", + "kubernetes.azure.com/ebpf-dataplane=cilium", + )), + Entry("Cilium w feature flag Microsoft.ContainerService/EnableCiliumNodeSubnet", + "azure", "", "cilium", + "none", + sets.New("kubernetes.azure.com/ebpf-dataplane=cilium")), + ) + + It("should include stateless CNI label for kubernetes 1.34+ set to true", func() { + nodeClass.Status.KubernetesVersion = lo.ToPtr("1.34.0") + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + decodedString := ExpectDecodedCustomData(azureEnv) + Expect(decodedString).To(SatisfyAll( + ContainSubstring("kubernetes.azure.com/network-stateless-cni=true"), + )) + }) + + It("should include stateless CNI label for kubernetes < 1.34 set to false", func() { + nodeClass.Status.KubernetesVersion = lo.ToPtr("1.33.0") + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + decodedString := ExpectDecodedCustomData(azureEnv) + Expect(decodedString).To(SatisfyAll( + ContainSubstring("kubernetes.azure.com/network-stateless-cni=false"), + )) + }) + }) + + Context("Create - Load Balancer", func() { + resourceGroup := "test-resourceGroup" + + It("should include loadbalancer backend pools the allocated VMs", func() { + 
standardLB := test.MakeStandardLoadBalancer(resourceGroup, loadbalancer.SLBName, true) + internalLB := test.MakeStandardLoadBalancer(resourceGroup, loadbalancer.InternalSLBName, false) + + azureEnv.LoadBalancersAPI.LoadBalancers.Store(lo.FromPtr(standardLB.ID), standardLB) + azureEnv.LoadBalancersAPI.LoadBalancers.Store(lo.FromPtr(internalLB.ID), internalLB) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + iface := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop().Interface + + Expect(iface.Properties.IPConfigurations).ToNot(BeEmpty()) + Expect(lo.FromPtr(iface.Properties.IPConfigurations[0].Properties.Primary)).To(Equal(true)) + + backendPools := iface.Properties.IPConfigurations[0].Properties.LoadBalancerBackendAddressPools + Expect(backendPools).To(HaveLen(3)) + Expect(lo.FromPtr(backendPools[0].ID)).To(Equal("/subscriptions/subscriptionID/resourceGroups/test-resourceGroup/providers/Microsoft.Network/loadBalancers/kubernetes/backendAddressPools/kubernetes")) + Expect(lo.FromPtr(backendPools[1].ID)).To(Equal("/subscriptions/subscriptionID/resourceGroups/test-resourceGroup/providers/Microsoft.Network/loadBalancers/kubernetes/backendAddressPools/aksOutboundBackendPool")) + Expect(lo.FromPtr(backendPools[2].ID)).To(Equal("/subscriptions/subscriptionID/resourceGroups/test-resourceGroup/providers/Microsoft.Network/loadBalancers/kubernetes-internal/backendAddressPools/kubernetes")) + }) + }) + + Context("Kubenet", func() { + var originalOptions *options.Options + + BeforeEach(func() { + originalOptions = options.FromContext(ctx) + ctx = options.ToContext( + ctx, + test.Options(test.OptionsFields{ + NetworkPlugin: 
lo.ToPtr("kubenet"), + })) + }) + + AfterEach(func() { + ctx = options.ToContext(ctx, originalOptions) + }) + + It("should not include cilium or azure cni vnet labels", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + customData := ExpectDecodedCustomData(azureEnv) + Expect(customData).To(Not(SatisfyAny( + ContainSubstring("kubernetes.azure.com/network-subnet=aks-subnet"), + ContainSubstring("kubernetes.azure.com/nodenetwork-vnetguid=a519e60a-cac0-40b2-b883-084477fe6f5c"), + ContainSubstring("kubernetes.azure.com/podnetwork-type=overlay"), + ))) + }) + + It("should support provisioning with kubeletConfig, computeResources and maxPods not specified", func() { + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + CPUManagerPolicy: lo.ToPtr("static"), + CPUCFSQuota: lo.ToPtr(true), + CPUCFSQuotaPeriod: metav1.Duration{}, + ImageGCHighThresholdPercent: lo.ToPtr(int32(30)), + ImageGCLowThresholdPercent: lo.ToPtr(int32(20)), + TopologyManagerPolicy: lo.ToPtr("best-effort"), + AllowedUnsafeSysctls: []string{"Allowed", "Unsafe", "Sysctls"}, + ContainerLogMaxSize: lo.ToPtr("42Mi"), + ContainerLogMaxFiles: lo.ToPtr[int32](13), + PodPidsLimit: lo.ToPtr[int64](99), + } + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + customData := ExpectDecodedCustomData(azureEnv) + expectedFlags := map[string]string{ + "eviction-hard": "memory.available<750Mi", + "max-pods": "110", + "image-gc-low-threshold": "20", + "image-gc-high-threshold": "30", + "cpu-cfs-quota": "true", + "topology-manager-policy": "best-effort", 
+ "container-log-max-size": "42Mi", + "allowed-unsafe-sysctls": "Allowed,Unsafe,Sysctls", + "cpu-manager-policy": "static", + "container-log-max-files": "13", + "pod-max-pids": "99", + } + ExpectKubeletFlags(azureEnv, customData, expectedFlags) + Expect(customData).To(SatisfyAny( + ContainSubstring("--system-reserved=cpu=0,memory=0"), + ContainSubstring("--system-reserved=memory=0,cpu=0"), + )) + Expect(customData).To(SatisfyAny( + ContainSubstring("--kube-reserved=cpu=100m,memory=1843Mi"), + ContainSubstring("--kube-reserved=memory=1843Mi,cpu=100m"), + )) + }) + + It("should support provisioning with kubeletConfig, computeResources and maxPods specified", func() { + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + CPUManagerPolicy: lo.ToPtr("static"), + CPUCFSQuota: lo.ToPtr(true), + CPUCFSQuotaPeriod: metav1.Duration{}, + ImageGCHighThresholdPercent: lo.ToPtr(int32(30)), + ImageGCLowThresholdPercent: lo.ToPtr(int32(20)), + TopologyManagerPolicy: lo.ToPtr("best-effort"), + AllowedUnsafeSysctls: []string{"Allowed", "Unsafe", "Sysctls"}, + ContainerLogMaxSize: lo.ToPtr("42Mi"), + ContainerLogMaxFiles: lo.ToPtr[int32](13), + PodPidsLimit: lo.ToPtr[int64](99), + } + nodeClass.Spec.MaxPods = lo.ToPtr(int32(15)) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + customData := ExpectDecodedCustomData(azureEnv) + expectedFlags := map[string]string{ + "eviction-hard": "memory.available<750Mi", + "max-pods": "15", + "image-gc-low-threshold": "20", + "image-gc-high-threshold": "30", + "cpu-cfs-quota": "true", + "topology-manager-policy": "best-effort", + "container-log-max-size": "42Mi", + "allowed-unsafe-sysctls": "Allowed,Unsafe,Sysctls", + "cpu-manager-policy": "static", + "container-log-max-files": 
"13", + "pod-max-pids": "99", + } + + ExpectKubeletFlags(azureEnv, customData, expectedFlags) + Expect(customData).To(SatisfyAny( + ContainSubstring("--system-reserved=cpu=0,memory=0"), + ContainSubstring("--system-reserved=memory=0,cpu=0"), + )) + Expect(customData).To(SatisfyAny( + ContainSubstring("--kube-reserved=cpu=100m,memory=1843Mi"), + ContainSubstring("--kube-reserved=memory=1843Mi,cpu=100m"), + )) + }) + }) + + Context("Create - VM Identity", func() { + It("should have VM identity set", func() { + ctx = options.ToContext( + ctx, + test.Options(test.OptionsFields{ + NodeIdentities: []string{ + "/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid1", + "/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid2", + }, + })) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Identity).ToNot(BeNil()) + + Expect(lo.FromPtr(vm.Identity.Type)).To(Equal(armcompute.ResourceIdentityTypeUserAssigned)) + Expect(vm.Identity.UserAssignedIdentities).ToNot(BeNil()) + Expect(vm.Identity.UserAssignedIdentities).To(HaveLen(2)) + Expect(vm.Identity.UserAssignedIdentities).To(HaveKey("/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid1")) + Expect(vm.Identity.UserAssignedIdentities).To(HaveKey("/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid2")) + }) + }) + + Context("Create - VM Profile", func() { + It("should have OS disk and network interface set to auto-delete", 
func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties).ToNot(BeNil()) + + Expect(vm.Properties.StorageProfile).ToNot(BeNil()) + Expect(vm.Properties.StorageProfile.OSDisk).ToNot(BeNil()) + osDiskDeleteOption := vm.Properties.StorageProfile.OSDisk.DeleteOption + Expect(osDiskDeleteOption).ToNot(BeNil()) + Expect(lo.FromPtr(osDiskDeleteOption)).To(Equal(armcompute.DiskDeleteOptionTypesDelete)) + + Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + + for _, nic := range vm.Properties.NetworkProfile.NetworkInterfaces { + nicDeleteOption := nic.Properties.DeleteOption + Expect(nicDeleteOption).To(Not(BeNil())) + Expect(lo.FromPtr(nicDeleteOption)).To(Equal(armcompute.DeleteOptionsDelete)) + } + }) + + It("should not create unneeded secondary ips for azure cni with overlay", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Properties).ToNot(BeNil()) + + Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) + Expect(len(vm.Properties.NetworkProfile.NetworkInterfaces)).To(Equal(1)) + Expect(lo.FromPtr(vm.Properties.NetworkProfile.NetworkInterfaces[0].Properties.Primary)).To(BeTrue()) + + 
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop().Interface + Expect(nic.Properties).ToNot(BeNil()) + + Expect(len(nic.Properties.IPConfigurations)).To(Equal(1)) + }) }) - It("should create AKS machine with only sysctls when only sysctls are specified", func() { - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - Sysctls: &v1beta1.SysctlConfiguration{ - VMMaxMapCount: lo.ToPtr(int32(262144)), - VMSwappiness: lo.ToPtr(int32(10)), - }, - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + Context("Create - MISC Bootstrap", func() { + It("should include or exclude --keep-terminated-pod-volumes based on kubelet version", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + customData := ExpectDecodedCustomData(azureEnv) + kubeletFlags := customData[strings.Index(customData, "KUBELET_FLAGS=")+len("KUBELET_FLAGS=") : strings.Index(customData, "KUBELET_NODE_LABELS")] + + k8sVersion, err := azureEnv.KubernetesVersionProvider.KubeServerVersion(ctx) + Expect(err).To(BeNil()) + minorVersion := semver.MustParse(k8sVersion).Minor + + if minorVersion < 31 { + Expect(kubeletFlags).To(ContainSubstring("--keep-terminated-pod-volumes")) + } else { + Expect(kubeletFlags).ToNot(ContainSubstring("--keep-terminated-pod-volumes")) + } + }) + + It("should include correct flags and credential provider URL when CredentialProviderURL is not empty", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, 
azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + customData := ExpectDecodedCustomData(azureEnv) + kubeletFlags := customData[strings.Index(customData, "KUBELET_FLAGS=")+len("KUBELET_FLAGS=") : strings.Index(customData, "KUBELET_NODE_LABELS")] + + k8sVersion, err := azureEnv.KubernetesVersionProvider.KubeServerVersion(ctx) + Expect(err).To(BeNil()) + credentialProviderURL := bootstrap.CredentialProviderURL(k8sVersion, "amd64") + + if credentialProviderURL != "" { + Expect(kubeletFlags).ToNot(ContainSubstring("--azure-container-registry-config")) + Expect(kubeletFlags).To(ContainSubstring("--image-credential-provider-config=/var/lib/kubelet/credential-provider-config.yaml")) + Expect(kubeletFlags).To(ContainSubstring("--image-credential-provider-bin-dir=/var/lib/kubelet/credential-provider")) + Expect(customData).To(ContainSubstring(credentialProviderURL)) + } + }) + + It("should include correct flags when CredentialProviderURL is empty", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + customData := ExpectDecodedCustomData(azureEnv) + kubeletFlags := customData[strings.Index(customData, "KUBELET_FLAGS=")+len("KUBELET_FLAGS=") : strings.Index(customData, "KUBELET_NODE_LABELS")] + + k8sVersion, err := azureEnv.KubernetesVersionProvider.KubeServerVersion(ctx) + Expect(err).To(BeNil()) + credentialProviderURL := bootstrap.CredentialProviderURL(k8sVersion, "amd64") + + if credentialProviderURL == "" { + Expect(kubeletFlags).To(ContainSubstring("--azure-container-registry-config")) + Expect(kubeletFlags).ToNot(ContainSubstring("--image-credential-provider-config")) + Expect(kubeletFlags).ToNot(ContainSubstring("--image-credential-provider-bin-dir")) + } + }) + }) + } + + // For Scriptless mode, these are not supported. 
+ // Bootstrappingclient mode does support some of these, but we are not investing in its coverage yet due to its deprecation. + if provisionMode.isAKSMachineMode() { + Context("Create - Additional Configurations", func() { + It("should handle configured NodeClass", func() { + // Configure comprehensive NodeClass settings + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + CPUManagerPolicy: lo.ToPtr("static"), + CPUCFSQuota: lo.ToPtr(true), + ImageGCHighThresholdPercent: lo.ToPtr(int32(85)), + ImageGCLowThresholdPercent: lo.ToPtr(int32(80)), + FailSwapOn: lo.ToPtr(false), + } + nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.Ubuntu2204ImageFamily) + + // Override context to use a BYO VNet instead of managed VNet + // This allows testing custom subnet configuration (managed VNet doesn't allow custom subnets) + byoClusterSubnetID := "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/byo-vnet-customname/subnets/cluster-subnet" + byoOpts := test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSMachineAPIHeaderBatch), + UseSIG: lo.ToPtr(true), + SubnetID: lo.ToPtr(byoClusterSubnetID), + }) + byoCtx := options.ToContext(ctx, byoOpts) + + // Extract cluster subnet components and create a test subnet in the same VNet + clusterSubnetComponents, err := utils.GetVnetSubnetIDComponents(byoClusterSubnetID) + Expect(err).ToNot(HaveOccurred()) + testSubnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/nodeclass-subnet", + clusterSubnetComponents.SubscriptionID, clusterSubnetComponents.ResourceGroupName, clusterSubnetComponents.VNetName) + nodeClass.Spec.VNETSubnetID = lo.ToPtr(testSubnetID) + nodeClass.Spec.Tags = map[string]string{ + "custom-tag": "custom-value", + "environment": "test", + "team": "platform", + } + nodeClass.Spec.OSDiskSizeGB = lo.ToPtr(int32(100)) + + // Configure GPU workload to test GPU node
selection + pod := coretest.UnschedulablePod(coretest.PodOptions{ + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + Limits: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + }, + }) + + ExpectApplied(byoCtx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(byoCtx, env.Client, statusController, nodeClass) + ExpectProvisionedAndWaitForPromises(byoCtx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(byoCtx, env.Client, pod) + + // Verify AKS machine was created + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := input.AKSMachine + + // Verify kubelet configuration + Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUManagerPolicy).To(Equal("static")) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.CPUCfsQuota).To(Equal(true)) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcHighThreshold).To(Equal(int32(85))) + Expect(*aksMachine.Properties.Kubernetes.KubeletConfig.ImageGcLowThreshold).To(Equal(int32(80))) + Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) + + // Verify image family configuration + Expect(string(*aksMachine.Properties.OperatingSystem.OSSKU)).To(Equal(v1beta1.Ubuntu2204ImageFamily)) + + // Verify subnet configuration (AKS machine should use the specified custom subnet) + Expect(aksMachine.Properties.Network).ToNot(BeNil()) + Expect(aksMachine.Properties.Network.VnetSubnetID).ToNot(BeNil()) + Expect(*aksMachine.Properties.Network.VnetSubnetID).To(Equal(testSubnetID)) + + // Verify custom tags from NodeClass + Expect(aksMachine.Properties.Tags).To(HaveKey("custom-tag")) + 
Expect(*aksMachine.Properties.Tags["custom-tag"]).To(Equal("custom-value")) + Expect(aksMachine.Properties.Tags).To(HaveKey("environment")) + Expect(*aksMachine.Properties.Tags["environment"]).To(Equal("test")) + Expect(aksMachine.Properties.Tags).To(HaveKey("team")) + Expect(*aksMachine.Properties.Tags["team"]).To(Equal("platform")) + + // Verify Karpenter-managed tags are still present and correct + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) + Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) + Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) + Expect(aksMachine.Properties.Tags).To(HaveKey("compute.aks.billing")) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_aksmachine_nodeclaim")) + + // Verify OS disk size configuration + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.OSDiskSizeGB).ToNot(BeNil()) + Expect(*aksMachine.Properties.OperatingSystem.OSDiskSizeGB).To(Equal(int32(100))) + + // Verify GPU node was selected (machine should be GPU-capable) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + vmSize := *aksMachine.Properties.Hardware.VMSize + Expect(utils.IsNvidiaEnabledSKU(vmSize)).To(BeTrue()) + + // Verify image selection - NodeImageVersion should be set correctly + Expect(aksMachine.Properties.NodeImageVersion).ToNot(BeNil()) + Expect(*aksMachine.Properties.NodeImageVersion).To(MatchRegexp(`^AKSUbuntu-.*-.*$`)) + }) + + It("should handle configured NodeClaim", func() { + nodeClaim.Spec.Taints = []v1.Taint{ + {Key: "test-taint", Value: "test-value", Effect: v1.TaintEffectNoSchedule}, + } + nodeClaim.Spec.StartupTaints = []v1.Taint{ + {Key: "startup-taint", Value: "startup-value", Effect: 
v1.TaintEffectNoExecute}, + } + + ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) + _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + + // Verify machine was created with correct taints + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + machine := input.AKSMachine + + // Check that taints are configured + // Currently, we will use "nodeInitializationTaints" field for all taints. More details in the relevant code (aksmachineinstancehelpers.go). + Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("test-taint=test-value:NoSchedule"))) + Expect(machine.Properties.Kubernetes.NodeInitializationTaints).To(ContainElement(lo.ToPtr("startup-taint=startup-value:NoExecute"))) + }) + + It("should not allow the user to override Karpenter-managed tags", func() { + nodeClass.Spec.Tags = map[string]string{ + "karpenter.azure.com/cluster": "my-override-cluster", + "karpenter.sh/nodepool": "my-override-nodepool", + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + // Verify AKS machine was created with correct Karpenter-managed tags (not user overrides) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + input := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := input.AKSMachine + + // Check that AKS machine has correct Karpenter-managed tags + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.sh_nodepool")) + 
Expect(aksMachine.Properties.Tags["karpenter.sh_nodepool"]).To(Equal(&nodePool.Name)) + Expect(aksMachine.Properties.Tags).To(HaveKey("karpenter.azure.com_cluster")) + Expect(aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).To(Equal(&testOptions.ClusterName)) + + // Verify user-specified tags are ignored for Karpenter-managed keys + Expect(*aksMachine.Properties.Tags["karpenter.sh_nodepool"]).ToNot(Equal("my-override-nodepool")) + Expect(*aksMachine.Properties.Tags["karpenter.azure.com_cluster"]).ToNot(Equal("my-override-cluster")) + }) + }) + + // Ported from VM test: "EncryptionAtHost" + Context("Create - EncryptionAtHost", func() { + It("should create AKS machine with EncryptionAtHost enabled when specified in AKSNodeClass", func() { + if nodeClass.Spec.Security == nil { + nodeClass.Spec.Security = &v1beta1.Security{} + } + nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(true) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.Security).ToNot(BeNil()) + Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeTrue()) + }) + + It("should create AKS machine with EncryptionAtHost disabled when specified in AKSNodeClass", func() { + if nodeClass.Spec.Security == nil { + nodeClass.Spec.Security = &v1beta1.Security{} + } + nodeClass.Spec.Security.EncryptionAtHost = lo.ToPtr(false) + ExpectApplied(ctx, 
env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.Security).ToNot(BeNil()) + Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) + }) + + It("should create AKS machine with EncryptionAtHost disabled when not specified in AKSNodeClass", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + // Security profile should still exist but EncryptionAtHost should be false (default) + Expect(aksMachine.Properties.Security).ToNot(BeNil()) + Expect(aksMachine.Properties.Security.EnableEncryptionAtHost).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Security.EnableEncryptionAtHost)).To(BeFalse()) + }) + }) + + Context("Create - LinuxOSConfig", func() { + It("should create AKS machine with full LinuxOSConfig when specified in AKSNodeClass", func() { + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ 
+ FailSwapOn: lo.ToPtr(false), + } + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + SwapFileSize: lo.ToPtr("1500Mi"), + TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragMadvise), + TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledAlways), + Sysctls: &v1beta1.SysctlConfiguration{ + FsAioMaxNr: lo.ToPtr(int32(65536)), + FsFileMax: lo.ToPtr(int32(12000)), + FsInotifyMaxUserWatches: lo.ToPtr(int32(781250)), + FsNrOpen: lo.ToPtr(int32(8192)), + KernelThreadsMax: lo.ToPtr(int32(30000)), + NetCoreNetdevMaxBacklog: lo.ToPtr(int32(1000)), + NetCoreOptmemMax: lo.ToPtr(int32(20480)), + NetCoreRmemDefault: lo.ToPtr(int32(212992)), + NetCoreRmemMax: lo.ToPtr(int32(212992)), + NetCoreSomaxconn: lo.ToPtr(int32(4096)), + NetCoreWmemDefault: lo.ToPtr(int32(212992)), + NetCoreWmemMax: lo.ToPtr(int32(212992)), + NetIPv4IPLocalPortRange: lo.ToPtr("32768 60999"), + NetIPv4NeighDefaultGcThresh1: lo.ToPtr(int32(128)), + NetIPv4NeighDefaultGcThresh2: lo.ToPtr(int32(512)), + NetIPv4NeighDefaultGcThresh3: lo.ToPtr(int32(1024)), + NetIPv4TCPFinTimeout: lo.ToPtr(int32(60)), + NetIPv4TCPKeepaliveProbes: lo.ToPtr(int32(9)), + NetIPv4TCPKeepaliveTime: lo.ToPtr(int32(7200)), + NetIPv4TCPMaxSynBacklog: lo.ToPtr(int32(128)), + NetIPv4TCPMaxTwBuckets: lo.ToPtr(int32(8000)), + NetIPv4TCPTwReuse: lo.ToPtr(true), + NetIPv4TCPKeepaliveIntvl: lo.ToPtr(int32(75)), + NetNetfilterNfConntrackBuckets: lo.ToPtr(int32(65536)), + NetNetfilterNfConntrackMax: lo.ToPtr(int32(131072)), + VMMaxMapCount: lo.ToPtr(int32(65530)), + VMSwappiness: lo.ToPtr(int32(60)), + VMVfsCachePressure: lo.ToPtr(int32(100)), + }, + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) + + // Verify top-level fields + Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(1500))) + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("madvise")) + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("always")) + + // Verify failSwapOn was wired through to kubelet config + Expect(aksMachine.Properties.Kubernetes.KubeletConfig).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.KubeletConfig.FailSwapOn)).To(BeFalse()) + + // Verify sysctl fields + Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsAioMaxNr)).To(Equal(int32(65536))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsFileMax)).To(Equal(int32(12000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsInotifyMaxUserWatches)).To(Equal(int32(781250))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.FsNrOpen)).To(Equal(int32(8192))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.KernelThreadsMax)).To(Equal(int32(30000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreNetdevMaxBacklog)).To(Equal(int32(1000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreOptmemMax)).To(Equal(int32(20480))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemDefault)).To(Equal(int32(212992))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreRmemMax)).To(Equal(int32(212992))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreSomaxconn)).To(Equal(int32(4096))) + 
Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemDefault)).To(Equal(int32(212992))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetCoreWmemMax)).To(Equal(int32(212992))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4IPLocalPortRange)).To(Equal("32768 60999")) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh1)).To(Equal(int32(128))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh2)).To(Equal(int32(512))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4NeighDefaultGcThresh3)).To(Equal(int32(1024))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPFinTimeout)).To(Equal(int32(60))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveProbes)).To(Equal(int32(9))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPKeepaliveTime)).To(Equal(int32(7200))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxSynBacklog)).To(Equal(int32(128))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPMaxTwBuckets)).To(Equal(int32(8000))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TCPTwReuse)).To(BeTrue()) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetIPv4TcpkeepaliveIntvl)).To(Equal(int32(75))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackBuckets)).To(Equal(int32(65536))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.NetNetfilterNfConntrackMax)).To(Equal(int32(131072))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(65530))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(60))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMVfsCachePressure)).To(Equal(int32(100))) + }) + + It("should create AKS machine with only sysctls when only sysctls are specified", func() { + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + Sysctls: &v1beta1.SysctlConfiguration{ + VMMaxMapCount: lo.ToPtr(int32(262144)), + VMSwappiness: lo.ToPtr(int32(10)), + }, + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, 
statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) + + // Top-level fields should be nil + Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) + Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) + Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) + + // Sysctls should be set + Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(262144))) + Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(10))) + + // Other sysctls should be nil + Expect(linuxOSConfig.Sysctls.FsAioMaxNr).To(BeNil()) + }) + + It("should create AKS machine with only TransparentHugePage settings when only TransparentHugePage is specified", func() { + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledNever), + TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragDefer), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) + + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("never")) + Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("defer")) + Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) + Expect(linuxOSConfig.Sysctls).To(BeNil()) + }) + + It("should create AKS machine with only SwapFileSize when only swap is specified", func() { + nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ + FailSwapOn: lo.ToPtr(false), + } + nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ + SwapFileSize: lo.ToPtr("2Gi"), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) + linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig + Expect(linuxOSConfig).ToNot(BeNil()) + Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(2048))) + Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) + Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) + 
Expect(linuxOSConfig.Sysctls).To(BeNil()) + }) + + It("should create AKS machine without LinuxProfile when LinuxOSConfig is not specified", func() { + // Explicitly ensure LinuxOSConfig is not set + nodeClass.Spec.LinuxOSConfig = nil + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) + Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).To(BeNil()) + }) + }) + + Context("Create - ArtifactStreaming", func() { + It("should set ArtifactStreamingProfile when explicitly enabled", func() { + nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ + Enabled: lo.ToPtr(true), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile.Enabled)).To(BeTrue()) 
+ }) + + It("should not set ArtifactStreamingProfile when not specified (defaults to disabled)", func() { + nodeClass.Spec.ArtifactStreaming = nil + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) + }) + + It("should not set ArtifactStreamingProfile when explicitly disabled", func() { + nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ + Enabled: lo.ToPtr(false), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) + Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) + }) + + It("should not set ArtifactStreamingProfile for ARM64 instance types even when enabled", func() { + nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ + Enabled: lo.ToPtr(true), + } + // ARM64 does not support 
artifact streaming; IsArtifactStreamingEnabled returns false for arm64. + // Verify through the NodeClass API directly since the test environment may not have ARM64 instance types. + Expect(nodeClass.IsArtifactStreamingEnabled("arm64")).To(BeFalse()) + Expect(nodeClass.IsArtifactStreamingEnabled("amd64")).To(BeTrue()) + }) + }) + + Context("Create - LocalDNS", func() { + It("should set LocalDNSProfile with mode Required", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeRequired, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) + Expect(aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides).To(HaveLen(2)) + Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) + }) + + It("should not set LocalDNSProfile when LocalDNS is nil", func() { + nodeClass.Spec.LocalDNS = nil + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, 
coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties.LocalDNSProfile).To(BeNil()) + }) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) + It("should correctly convert override fields including durations", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeRequired, + VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, + QueryLogging: v1beta1.LocalDNSQueryLoggingLog, + Protocol: v1beta1.LocalDNSProtocolForceTCP, + ForwardPolicy: v1beta1.LocalDNSForwardPolicyRoundRobin, + MaxConcurrent: lo.ToPtr(int32(50)), + CacheDuration: karpv1.MustParseNillableDuration("30s"), + ServeStaleDuration: karpv1.MustParseNillableDuration("60s"), + ServeStale: v1beta1.LocalDNSServeStaleImmediate, + }, + { + Zone: "cluster.local", + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + QueryLogging: v1beta1.LocalDNSQueryLoggingLog, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(10)), + CacheDuration: 
karpv1.MustParseNillableDuration("10s"), + ServeStaleDuration: karpv1.MustParseNillableDuration("5s"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, + }, + }, + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - // Top-level fields should be nil - Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) - Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) - Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - // Sysctls should be set - Expect(linuxOSConfig.Sysctls).ToNot(BeNil()) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMMaxMapCount)).To(Equal(int32(262144))) - Expect(lo.FromPtr(linuxOSConfig.Sysctls.VMSwappiness)).To(Equal(int32(10))) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - // Other sysctls should be nil - Expect(linuxOSConfig.Sysctls.FsAioMaxNr).To(BeNil()) - }) + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + + vnetOverride := aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides["."] + Expect(vnetOverride).ToNot(BeNil()) + Expect(lo.FromPtr(vnetOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationVnetDNS)) + Expect(lo.FromPtr(vnetOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) + Expect(lo.FromPtr(vnetOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolForceTCP)) + Expect(lo.FromPtr(vnetOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicyRoundRobin)) + 
Expect(lo.FromPtr(vnetOverride.MaxConcurrent)).To(Equal(int32(50))) + Expect(lo.FromPtr(vnetOverride.CacheDurationInSeconds)).To(Equal(int32(30))) + Expect(lo.FromPtr(vnetOverride.ServeStaleDurationInSeconds)).To(Equal(int32(60))) + Expect(lo.FromPtr(vnetOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleImmediate)) + }) - It("should create AKS machine with only TransparentHugePage settings when only TransparentHugePage is specified", func() { - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - TransparentHugePageEnabled: lo.ToPtr(v1beta1.TransparentHugePageEnabledNever), - TransparentHugePageDefrag: lo.ToPtr(v1beta1.TransparentHugePageDefragDefer), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should set LocalDNSProfile with mode Disabled", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeDisabled, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) + }) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageEnabled)).To(Equal("never")) - Expect(lo.FromPtr(linuxOSConfig.TransparentHugePageDefrag)).To(Equal("defer")) - Expect(linuxOSConfig.SwapFileSizeMB).To(BeNil()) - Expect(linuxOSConfig.Sysctls).To(BeNil()) - }) + It("should rewrite Preferred to Required on the wire when Status.LocalDNSState=Enabled", func() { + // Preferred is never sent downstream — Karpenter is the only kube-aware + // resolver, so ResolvedLocalDNSForWire rewrites Mode to the terminal + // value implied by Status.LocalDNSState. Enabled => Required. 
+ nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModePreferred, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + nodeClass.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - It("should create AKS machine with only SwapFileSize when only swap is specified", func() { - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - FailSwapOn: lo.ToPtr(false), - } - nodeClass.Spec.LinuxOSConfig = &v1beta1.LinuxOSConfiguration{ - SwapFileSize: lo.ToPtr("2Gi"), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).ToNot(BeNil()) - linuxOSConfig := aksMachine.Properties.OperatingSystem.LinuxProfile.LinuxOSConfig - Expect(linuxOSConfig).ToNot(BeNil()) - 
Expect(lo.FromPtr(linuxOSConfig.SwapFileSizeMB)).To(Equal(int32(2048))) - Expect(linuxOSConfig.TransparentHugePageDefrag).To(BeNil()) - Expect(linuxOSConfig.TransparentHugePageEnabled).To(BeNil()) - Expect(linuxOSConfig.Sysctls).To(BeNil()) - }) + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) + }) - It("should create AKS machine without LinuxProfile when LinuxOSConfig is not specified", func() { - // Explicitly ensure LinuxOSConfig is not set - nodeClass.Spec.LinuxOSConfig = nil - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should rewrite Preferred to Disabled on the wire when Status.LocalDNSState is unset", func() { + // Defense-in-depth: if Status hasn't been resolved yet, never pass + // Preferred downstream — the downstream resolver cannot see cluster + // gates and would re-decide incorrectly. Fall back to Disabled. + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModePreferred, + VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + // The status sub-reconciler resolves Preferred to Enabled in this + // test env (no cluster conflicts). Wipe LocalDNSState back to nil + // via a status Patch to drive the "Status not yet resolved" + // branch of ResolvedLocalDNSForWire. Re-fetch first because the + // reconcile bumped the resource version. 
+ Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + stored := nodeClass.DeepCopy() + nodeClass.Status.LocalDNSState = nil + Expect(env.Client.Status().Patch(ctx, nodeClass, client.MergeFrom(stored))).To(Succeed()) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.OperatingSystem).ToNot(BeNil()) - Expect(aksMachine.Properties.OperatingSystem.LinuxProfile).To(BeNil()) - }) - }) + Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) + }) - Context("Create - ArtifactStreaming", func() { - It("should set ArtifactStreamingProfile when explicitly enabled", func() { - nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ - Enabled: lo.ToPtr(true), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should correctly convert KubeDNSOverrides field values", func() { + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeRequired, + 
VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), + KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + QueryLogging: v1beta1.LocalDNSQueryLoggingLog, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(25)), + CacheDuration: karpv1.MustParseNillableDuration("15s"), + ServeStaleDuration: karpv1.MustParseNillableDuration("45s"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, + }, + validLocalDNSZoneOverride("cluster.local", v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + }, + } + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) - Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile.Enabled)).To(BeTrue()) + 
Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) + Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) + + kubeOverride := aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides["."] + Expect(kubeOverride).ToNot(BeNil()) + Expect(lo.FromPtr(kubeOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationClusterCoreDNS)) + Expect(lo.FromPtr(kubeOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) + Expect(lo.FromPtr(kubeOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolPreferUDP)) + Expect(lo.FromPtr(kubeOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicySequential)) + Expect(lo.FromPtr(kubeOverride.MaxConcurrent)).To(Equal(int32(25))) + Expect(lo.FromPtr(kubeOverride.CacheDurationInSeconds)).To(Equal(int32(15))) + Expect(lo.FromPtr(kubeOverride.ServeStaleDurationInSeconds)).To(Equal(int32(45))) + Expect(lo.FromPtr(kubeOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleVerify)) + }) }) - It("should not set ArtifactStreamingProfile when not specified (defaults to disabled)", func() { - nodeClass.Spec.ArtifactStreaming = nil - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + } +} - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) +var _ = Describe("CloudProvider", func() { + Context("ProvisionMode = BootstrappingClient", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeBootstrappingClient), + }) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := 
azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + azureEnv = test.NewEnvironment(ctx, env) + azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) - Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) - }) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) - It("should not set ArtifactStreamingProfile when explicitly disabled", func() { - nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ - Enabled: lo.ToPtr(false), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + }) - pod := 
coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.Kubernetes).ToNot(BeNil()) - Expect(aksMachine.Properties.Kubernetes.ArtifactStreamingProfile).To(BeNil()) + AfterEach(func() { + // Wait for any async polling goroutines to complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + azureEnvNonZonal.Reset(ctx) }) - It("should not set ArtifactStreamingProfile for ARM64 instance types even when enabled", func() { - nodeClass.Spec.ArtifactStreaming = &v1beta1.ArtifactStreaming{ - Enabled: lo.ToPtr(true), + // The coverage in this context is currently unique to this mode. + // It is still possible to reunify it with the other modes, but that may not be worth it given the deprecation (and migration to Machine API). + Context("Create - Bootstrap", func() { + type wellKnownLabelEntry struct { + name string + label string + valueFunc func() string + setupFunc func() + expectedInKubeletLabels bool + expectedOnNode bool } - // ARM64 does not support artifact streaming; IsArtifactStreamingEnabled returns false for arm64. - // Verify through the NodeClass API directly since the test environment may not have ARM64 instance types.
- Expect(nodeClass.IsArtifactStreamingEnabled("arm64")).To(BeFalse()) - Expect(nodeClass.IsArtifactStreamingEnabled("amd64")).To(BeTrue()) - }) - }) - Context("Create - LocalDNS", func() { - It("should set LocalDNSProfile with mode Required", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeRequired, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), + requireFunc := func(key, value string) func() { + return func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, + karpv1.NodeSelectorRequirementWithMinValues{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}, + ) + } } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + entries := []wellKnownLabelEntry{ + {name: v1.LabelTopologyRegion, label: v1.LabelTopologyRegion, valueFunc: func() string { return fake.Region }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: karpv1.NodePoolLabelKey, label: karpv1.NodePoolLabelKey, valueFunc: func() string { return nodePool.Name }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1.LabelTopologyZone, label: v1.LabelTopologyZone, valueFunc: func() string { return fakeZone1 }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1.LabelInstanceTypeStable, label: v1.LabelInstanceTypeStable, valueFunc: func() string { return "Standard_NC24ads_A100_v4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1.LabelOSStable, label: v1.LabelOSStable, valueFunc: func() string { return "linux" }, expectedInKubeletLabels: true, 
expectedOnNode: true}, + {name: v1.LabelArchStable, label: v1.LabelArchStable, valueFunc: func() string { return "amd64" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: karpv1.CapacityTypeLabelKey, label: karpv1.CapacityTypeLabelKey, valueFunc: func() string { return "on-demand" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelPlacementScope, label: v1beta1.LabelPlacementScope, valueFunc: func() string { return v1beta1.PlacementScopeZonal }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUName, label: v1beta1.LabelSKUName, valueFunc: func() string { return "Standard_NC24ads_A100_v4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUFamily, label: v1beta1.LabelSKUFamily, valueFunc: func() string { return "N" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUSeries, label: v1beta1.LabelSKUSeries, valueFunc: func() string { return "NCads_v4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUVersion, label: v1beta1.LabelSKUVersion, valueFunc: func() string { return "4" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUStorageEphemeralOSMaxSize, label: v1beta1.LabelSKUStorageEphemeralOSMaxSize, valueFunc: func() string { return "429" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUAcceleratedNetworking, label: v1beta1.LabelSKUAcceleratedNetworking, valueFunc: func() string { return "true" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUStoragePremiumCapable, label: v1beta1.LabelSKUStoragePremiumCapable, valueFunc: func() string { return "true" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUGPUName, label: v1beta1.LabelSKUGPUName, valueFunc: func() string { return "A100" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: 
v1beta1.LabelSKUGPUManufacturer, label: v1beta1.LabelSKUGPUManufacturer, valueFunc: func() string { return "nvidia" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUGPUCount, label: v1beta1.LabelSKUGPUCount, valueFunc: func() string { return "1" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUCPU, label: v1beta1.LabelSKUCPU, valueFunc: func() string { return "24" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.LabelSKUMemory, label: v1beta1.LabelSKUMemory, valueFunc: func() string { return "8192" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelCPU, label: v1beta1.AKSLabelCPU, valueFunc: func() string { return "24" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelMemory, label: v1beta1.AKSLabelMemory, valueFunc: func() string { return "8192" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelMode + "=user", label: v1beta1.AKSLabelMode, valueFunc: func() string { return "user" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelMode + "=system", label: v1beta1.AKSLabelMode, valueFunc: func() string { return "system" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelScaleSetPriority + "=regular", label: v1beta1.AKSLabelScaleSetPriority, valueFunc: func() string { return "regular" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelScaleSetPriority + "=spot", label: v1beta1.AKSLabelScaleSetPriority, valueFunc: func() string { return "spot" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelPriority + "=regular", label: v1beta1.AKSLabelPriority, valueFunc: func() string { return "regular" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelPriority + "=spot", label: v1beta1.AKSLabelPriority, valueFunc: func() string { return "spot" 
}, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: v1beta1.AKSLabelOSSKU, label: v1beta1.AKSLabelOSSKU, valueFunc: func() string { return "Ubuntu" }, expectedInKubeletLabels: true, expectedOnNode: true}, + { + name: v1beta1.AKSLabelFIPSEnabled, + label: v1beta1.AKSLabelFIPSEnabled, + setupFunc: func() { + testOptions.UseSIG = true + ctx = options.ToContext(ctx, testOptions) + nodeClass.Spec.FIPSMode = &v1beta1.FIPSModeFIPS + nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.AzureLinuxImageFamily) + azureEnv = test.NewEnvironment(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + }, + valueFunc: func() string { return "true" }, + expectedInKubeletLabels: true, + expectedOnNode: true, + }, + {name: v1.LabelFailureDomainBetaRegion, label: v1.LabelFailureDomainBetaRegion, valueFunc: func() string { return fake.Region }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: v1.LabelFailureDomainBetaZone, label: v1.LabelFailureDomainBetaZone, valueFunc: func() string { return fakeZone1 }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: 
"beta.kubernetes.io/arch", label: "beta.kubernetes.io/arch", valueFunc: func() string { return "amd64" }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: "beta.kubernetes.io/os", label: "beta.kubernetes.io/os", valueFunc: func() string { return "linux" }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: v1.LabelInstanceType, label: v1.LabelInstanceType, valueFunc: func() string { return "Standard_NC24ads_A100_v4" }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: "topology.disk.csi.azure.com/zone", label: "topology.disk.csi.azure.com/zone", valueFunc: func() string { return fakeZone1 }, expectedInKubeletLabels: false, expectedOnNode: false}, + {name: v1.LabelWindowsBuild, label: v1.LabelWindowsBuild, valueFunc: func() string { return "window" }, expectedInKubeletLabels: true, expectedOnNode: false}, + {name: v1beta1.AKSLabelCluster, label: v1beta1.AKSLabelCluster, valueFunc: func() string { return "test-resourceGroup" }, expectedInKubeletLabels: true, expectedOnNode: true}, + {name: "kubernetes.io (previously reserved)", label: "kubernetes.io/custom-label", setupFunc: requireFunc("kubernetes.io/custom-label", "custom-value"), valueFunc: func() string { return "custom-value" }, expectedInKubeletLabels: false, expectedOnNode: true}, + {name: "k8s.io (previously reserved)", label: "k8s.io/custom-label", setupFunc: requireFunc("k8s.io/custom-label", "custom-value"), valueFunc: func() string { return "custom-value" }, expectedInKubeletLabels: false, expectedOnNode: true}, + {name: "kubelet.kubernetes.io (kubelet-allowed)", label: "kubelet.kubernetes.io/custom-label", setupFunc: requireFunc("kubelet.kubernetes.io/custom-label", "custom-value"), valueFunc: func() string { return "custom-value" }, expectedInKubeletLabels: true, expectedOnNode: true}, + } - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := 
azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + It("entries should cover every WellKnownLabel", func() { + expectedLabels := append(karpv1.WellKnownLabels.UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...) + Expect(lo.Map(entries, func(item wellKnownLabelEntry, _ int) string { return item.label })).To(ContainElements(expectedLabels)) + }) - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) - Expect(aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides).To(HaveLen(2)) - Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) - }) + nonSchedulableLabels := map[string]string{ + labels.AKSLabelRole: "agent", + v1beta1.AKSLabelKubeletIdentityClientID: test.Options().KubeletIdentityClientID, + "kubernetes.azure.com/mode": "user", + labels.AKSLabelSubnetName: "aks-subnet", + labels.AKSLabelVNetGUID: test.Options().VnetGUID, + labels.AKSLabelAzureCNIOverlay: strconv.FormatBool(true), + labels.AKSLabelPodNetworkType: consts.NetworkPluginModeOverlay, + karpv1.NodeDoNotSyncTaintsLabelKey: "true", + } - It("should not set LocalDNSProfile when LocalDNS is nil", func() { - nodeClass.Spec.LocalDNS = nil - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + It("should provision the node and CSE", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectCSEProvisioned(azureEnv) + ExpectScheduled(ctx, env.Client, pod) + }) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + 
DescribeTable( + "should support individual instance type labels (when all pods scheduled individually) on bootstrap API", + func(item wellKnownLabelEntry) { + if item.setupFunc != nil { + item.setupFunc() + } + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + value := item.valueFunc() + pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{item.label: value}}) + if item.label != v1.LabelWindowsBuild { + bindings := []Bindings{} + for range 3 { + bindings = append(bindings, ExpectProvisionedNoBinding(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)) + } + for i := range len(bindings) { + Expect(lo.Values(bindings[i])).ToNot(BeEmpty()) + Expect(lo.Values(bindings[i])[0].Node.Name).To(Equal(lo.Values(bindings[0])[0].Node.Name), "expected all bindings to have the same node name") + } + } + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + + if item.expectedOnNode { + Expect(node.Labels[item.label]).To(Equal(value)) + } else { + Expect(node.Labels).ToNot(HaveKey(item.label)) + } + + Expect(azureEnv.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Len()).To(Equal(1)) + bootstrapInput := azureEnv.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Pop() + if item.expectedInKubeletLabels { + Expect(bootstrapInput.Params.ProvisionProfile.CustomNodeLabels).To(HaveKeyWithValue(item.label, value)) + } else { + Expect(bootstrapInput.Params.ProvisionProfile.CustomNodeLabels).ToNot(HaveKeyWithValue(item.label, value)) + } + }, + lo.Map(entries, func(item wellKnownLabelEntry, _ int) TableEntry { + return Entry(item.name, item) + }), + ) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + It("should write 
other (non-schedulable) labels to kubelet on bootstrap API", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) - Expect(aksMachine.Properties.LocalDNSProfile).To(BeNil()) - }) + Expect(azureEnv.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Len()).To(Equal(1)) + bootstrapInput := azureEnv.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Pop() + for key, value := range nonSchedulableLabels { + Expect(bootstrapInput.Params.ProvisionProfile.CustomNodeLabels).To(HaveKeyWithValue(key, value)) + } + }) - It("should correctly convert override fields including durations", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeRequired, - VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, - QueryLogging: v1beta1.LocalDNSQueryLoggingLog, - Protocol: v1beta1.LocalDNSProtocolForceTCP, - ForwardPolicy: v1beta1.LocalDNSForwardPolicyRoundRobin, - MaxConcurrent: lo.ToPtr(int32(50)), - CacheDuration: karpv1.MustParseNillableDuration("30s"), - ServeStaleDuration: karpv1.MustParseNillableDuration("60s"), - ServeStale: v1beta1.LocalDNSServeStaleImmediate, + It("should not reattempt creation of a vm thats been created before, and also not CSE", func() { + nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, }, - { - Zone: "cluster.local", - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - QueryLogging: v1beta1.LocalDNSQueryLoggingLog, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(10)), - CacheDuration: 
karpv1.MustParseNillableDuration("10s"), - ServeStaleDuration: karpv1.MustParseNillableDuration("5s"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, + Spec: karpv1.NodeClaimSpec{NodeClassRef: &karpv1.NodeClassReference{Name: nodeClass.Name}}, + }) + vmName := instance.GenerateResourceName(nodeClaim.Name) + vm := &armcompute.VirtualMachine{ + Name: lo.ToPtr(vmName), + ID: lo.ToPtr(fake.MkVMID(options.FromContext(ctx).NodeResourceGroup, vmName)), + Location: lo.ToPtr(fake.Region), + Zones: []*string{lo.ToPtr("fantasy-zone")}, + Properties: &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now()), + HardwareProfile: &armcompute.HardwareProfile{ + VMSize: lo.ToPtr(armcompute.VirtualMachineSizeTypesBasicA3), + }, }, - }, - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + } + azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + _, err := cloudProvider.Create(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - - vnetOverride := aksMachine.Properties.LocalDNSProfile.VnetDNSOverrides["."] - Expect(vnetOverride).ToNot(BeNil()) - Expect(lo.FromPtr(vnetOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationVnetDNS)) - 
Expect(lo.FromPtr(vnetOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) - Expect(lo.FromPtr(vnetOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolForceTCP)) - Expect(lo.FromPtr(vnetOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicyRoundRobin)) - Expect(lo.FromPtr(vnetOverride.MaxConcurrent)).To(Equal(int32(50))) - Expect(lo.FromPtr(vnetOverride.CacheDurationInSeconds)).To(Equal(int32(30))) - Expect(lo.FromPtr(vnetOverride.ServeStaleDurationInSeconds)).To(Equal(int32(60))) - Expect(lo.FromPtr(vnetOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleImmediate)) + ExpectCSENotProvisioned(azureEnv) + }) }) + }) - It("should set LocalDNSProfile with mode Disabled", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeDisabled, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + Context("ProvisionMode = AKSScriptless, ManageExistingAKSMachines = false", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSScriptless), + ManageExistingAKSMachines: lo.ToPtr(false), + }) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + azureEnv = 
test.NewEnvironment(ctx, env) + azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) - }) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) - It("should rewrite Preferred to Required on the wire when Status.LocalDNSState=Enabled", func() { - // Preferred is never sent downstream — Karpenter is the only kube-aware - // resolver, so ResolvedLocalDNSForWire rewrites Mode to the terminal - // value implied by Status.LocalDNSState. Enabled => Required. 
- nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModePreferred, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - nodeClass.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + }) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeRequired)) + AfterEach(func() { + // Wait for any async polling goroutines to complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + azureEnvNonZonal.Reset(ctx) }) - It("should rewrite Preferred to Disabled on the wire when Status.LocalDNSState is unset", func() { - // Defense-in-depth: if Status hasn't been resolved yet, never pass - // Preferred downstream — the downstream resolver cannot see cluster - // gates and would re-decide incorrectly. Fall back to Disabled. 
- nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModePreferred, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - // The status sub-reconciler resolves Preferred to Enabled in this - // test env (no cluster conflicts). Wipe LocalDNSState back to nil - // via a status Patch to drive the "Status not yet resolved" - // branch of ResolvedLocalDNSForWire. Re-fetch first because the - // reconcile bumped the resource version. - Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) - stored := nodeClass.DeepCopy() - nodeClass.Status.LocalDNSState = nil - Expect(env.Client.Status().Patch(ctx, nodeClass, client.MergeFrom(stored))).To(Succeed()) + runFeatureTests(aksscriptlessProvisionMode()) + }) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) + Context("ProvisionMode = AKSScriptless, ManageExistingAKSMachines = true", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSScriptless), + ManageExistingAKSMachines: lo.ToPtr(true), + }) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine + azureEnv = test.NewEnvironment(ctx, env) + azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) + statusController = status.NewController(env.Client, 
azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(lo.FromPtr(aksMachine.Properties.LocalDNSProfile.Mode)).To(Equal(armcontainerservice.LocalDNSModeDisabled)) - }) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) - It("should correctly convert KubeDNSOverrides field values", func() { - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeRequired, - VnetDNSOverrides: validLocalDNSOverridePair(v1beta1.LocalDNSForwardDestinationVnetDNS), - KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - QueryLogging: v1beta1.LocalDNSQueryLoggingLog, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(25)), - CacheDuration: 
karpv1.MustParseNillableDuration("15s"), - ServeStaleDuration: karpv1.MustParseNillableDuration("45s"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - validLocalDNSZoneOverride("cluster.local", v1beta1.LocalDNSForwardDestinationClusterCoreDNS), - }, - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + }) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - - Expect(aksMachine.Properties.LocalDNSProfile).ToNot(BeNil()) - Expect(aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides).To(HaveLen(2)) - - kubeOverride := aksMachine.Properties.LocalDNSProfile.KubeDNSOverrides["."] - Expect(kubeOverride).ToNot(BeNil()) - Expect(lo.FromPtr(kubeOverride.ForwardDestination)).To(Equal(armcontainerservice.LocalDNSForwardDestinationClusterCoreDNS)) - Expect(lo.FromPtr(kubeOverride.QueryLogging)).To(Equal(armcontainerservice.LocalDNSQueryLoggingLog)) - Expect(lo.FromPtr(kubeOverride.Protocol)).To(Equal(armcontainerservice.LocalDNSProtocolPreferUDP)) - Expect(lo.FromPtr(kubeOverride.ForwardPolicy)).To(Equal(armcontainerservice.LocalDNSForwardPolicySequential)) - Expect(lo.FromPtr(kubeOverride.MaxConcurrent)).To(Equal(int32(25))) - Expect(lo.FromPtr(kubeOverride.CacheDurationInSeconds)).To(Equal(int32(15))) - Expect(lo.FromPtr(kubeOverride.ServeStaleDurationInSeconds)).To(Equal(int32(45))) - Expect(lo.FromPtr(kubeOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleVerify)) + AfterEach(func() { + // Wait for any async polling goroutines to 
complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + azureEnvNonZonal.Reset(ctx) }) + + runFeatureTests(aksscriptlessProvisionMode()) }) -} -var _ = Describe("CloudProvider", func() { Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { BeforeEach(func() { testOptions = test.Options(test.OptionsFields{ @@ -1217,7 +2582,8 @@ var _ = Describe("CloudProvider", func() { azureEnv.Reset(ctx) azureEnvNonZonal.Reset(ctx) }) - runAKSMachineAPIFeatureTests() + + runFeatureTests(aksMachineAPIHeaderBatchProvisionMode()) }) }) diff --git a/pkg/cloudprovider/suite_integration_test.go b/pkg/cloudprovider/suite_integration_test.go index a04bedf42..1c190c4b0 100644 --- a/pkg/cloudprovider/suite_integration_test.go +++ b/pkg/cloudprovider/suite_integration_test.go @@ -18,16 +18,27 @@ package cloudprovider // TODO v1beta1 extra refactor into suite_test.go / cloudprovider_test.go import ( + "fmt" + "net/http" + "time" + + sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" + "github.com/Azure/karpenter-provider-azure/pkg/providers/instance" . "github.com/Azure/karpenter-provider-azure/pkg/test/expectations" "github.com/awslabs/operatorpkg/object" + corestatus "github.com/awslabs/operatorpkg/status" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "github.com/samber/lo" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + karpv1alpha1 "sigs.k8s.io/karpenter/pkg/apis/v1alpha1" corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/controllers/nodeoverlay" "sigs.k8s.io/karpenter/pkg/controllers/provisioning" "sigs.k8s.io/karpenter/pkg/controllers/state" "sigs.k8s.io/karpenter/pkg/events" @@ -48,16 +59,7 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/utils/zones" ) -func validateAKSMachineNodeClaim(nodeClaim *karpv1.NodeClaim, nodePool *karpv1.NodePool) { - // Common validations - validateNodeClaimCommon(nodeClaim, nodePool) - - // AKS-specific annotations - Expect(nodeClaim.Annotations).To(HaveKey(v1beta1.AnnotationAKSMachineResourceID)) - Expect(nodeClaim.Annotations[v1beta1.AnnotationAKSMachineResourceID]).ToNot(BeEmpty()) -} - -func runSharedProvisionModeIntegrationTests(provisionMode provisionModeTestCase) { +func runIntegrationTests(provisionMode provisionModeTestCase) { It("should be able to handle basic operations", func() { ExpectApplied(ctx, env.Client, nodeClass, nodePool) @@ -114,212 +116,513 @@ func runSharedProvisionModeIntegrationTests(provisionMode provisionModeTestCase) resetCreateCalls: provisionMode.resetCreateCalls, expectCreateCalls: provisionMode.expectCreateCalls, }) + + Context("Create - CloudProvider Create Error Cases", func() { + It("should return error when NodeClass readiness is Unknown", func() { + nodeClass.StatusConditions().SetUnknown(corestatus.ConditionReady) + testNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + karpv1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Name: nodeClass.Name, + 
Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + }, + }, + }) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim) + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim) + Expect(err).To(HaveOccurred()) + Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) + Expect(claim).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("resolving NodeClass readiness, NodeClass is in Ready=Unknown")) + }) + + It("should return error when instance creation fails", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + testNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + karpv1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + }, + }, + }) + + expectedErrorMessage := "creating instance failed" + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAny() + expectedErrorMessage = "creating AKS machine failed" + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, "Failed to create VM"), + }, + }, + ) + } + + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim) + Expect(err).To(HaveOccurred()) + Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) + Expect(claim).To(BeNil()) + Expect(err.Error()).To(ContainSubstring(expectedErrorMessage)) + }) + + It("should return error when instance type resolution fails", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.InstanceTypesProvider.Reset() + + testNodeClaim 
:= coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + karpv1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + }, + }, + }) + + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim) + Expect(err).To(HaveOccurred()) + Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) + Expect(claim).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("resolving instance types")) + + Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + }) + + It("should return an ICE error when there are no instance types to launch", func() { + // Specify no instance types and expect to receive a capacity error + nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ + { + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"doesnotexist"}, // will not match any instance types, + }, + } + + ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) + cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(cloudProviderMachine).To(BeNil()) + }) + + if !provisionMode.isAKSMachineMode() { + // TODO: share this with Machine API mode + It("should not reattempt creation of a vm thats been created before", func() { + nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, + }, + Spec: karpv1.NodeClaimSpec{NodeClassRef: &karpv1.NodeClassReference{Name: nodeClass.Name}}, + }) + vmName := instance.GenerateResourceName(nodeClaim.Name) + vm := &armcompute.VirtualMachine{ + Name: lo.ToPtr(vmName), + ID: 
lo.ToPtr(fake.MkVMID(options.FromContext(ctx).NodeResourceGroup, vmName)), + Location: lo.ToPtr(fake.Region), + Zones: []*string{lo.ToPtr("fantasy-zone")}, + Properties: &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now()), + HardwareProfile: &armcompute.HardwareProfile{ + VMSize: lo.ToPtr(armcompute.VirtualMachineSizeTypesBasicA3), + }, + }, + } + azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + }) + + // NIC handling is delegated to Machine API + It("should delete the network interface on failure to create the vm", func() { + errMsg := "test error" + errCode := fmt.Sprint(http.StatusNotFound) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: errCode, + RawResponse: &http.Response{ + Body: createSDKErrorBody(errCode, errMsg), + }, + }, + ) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(lo.FromPtr(nic.Interface.ID)) + Expect(ok).To(Equal(false)) + + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectScheduled(ctx, env.Client, pod) + }) + } + }) + + runUnhappyPathHandlingTests(provisionMode) } -func 
runAKSMachineAPIIntegrationTests() { - // XPMT: TODO(comtalyst): deep inspection test on simulating all of these? +func reconcileCapacityOverlay(customResource v1.ResourceName, overlayCapacity resource.Quantity) { + GinkgoHelper() + nodeOverlay := coretest.NodeOverlay(karpv1alpha1.NodeOverlay{ + Spec: karpv1alpha1.NodeOverlaySpec{ + Requirements: []karpv1alpha1.NodeSelectorRequirement{{ + Key: karpv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{nodePool.Name}, + }}, + Capacity: v1.ResourceList{customResource: overlayCapacity}, + }, + }) + ExpectApplied(ctx, env.Client, nodeOverlay) + nodeOverlayController := nodeoverlay.NewController(env.Client, cloudProvider, azureEnv.InstanceTypeStore, cluster) + ExpectReconcileSucceeded(ctx, nodeOverlayController, client.ObjectKeyFromObject(nodeOverlay)) +} + +type nodeOverlayCapacityTestOptions struct { + validateNodeClaim func(*karpv1.NodeClaim) + resetCreateCalls func() + expectCreateCalls func() +} + +func runNodeOverlayCapacityTests(testOptions nodeOverlayCapacityTestOptions) { + Context("NodeOverlay", func() { + It("should launch a NodeClaim that requests capacity added by a NodeOverlay", func() { + ctx = coreoptions.ToContext(ctx, coretest.Options(coretest.OptionsFields{ + FeatureGates: coretest.FeatureGates{NodeOverlay: lo.ToPtr(true)}, + })) + customResource := v1.ResourceName("example.com/dongle") + overlayCapacity := resource.MustParse("100") + nodeClaim.Spec.Resources.Requests = v1.ResourceList{customResource: resource.MustParse("1")} + + ExpectApplied(ctx, env.Client, nodeClass, nodePool, nodeClaim) + reconcileCapacityOverlay(customResource, overlayCapacity) + + if testOptions.resetCreateCalls != nil { + testOptions.resetCreateCalls() + } + cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(cloudProviderMachine).ToNot(BeNil()) + if testOptions.validateNodeClaim != nil { + 
testOptions.validateNodeClaim(cloudProviderMachine) + } + if testOptions.expectCreateCalls != nil { + testOptions.expectCreateCalls() + } + capacity, ok := cloudProviderMachine.Status.Capacity[customResource] + Expect(ok).To(BeTrue()) + Expect(capacity.Cmp(overlayCapacity)).To(Equal(0)) + allocatable, ok := cloudProviderMachine.Status.Allocatable[customResource] + Expect(ok).To(BeTrue()) + Expect(allocatable.Cmp(overlayCapacity)).To(Equal(0)) + }) + + It("should not use overlaid capacity when NodeOverlay is disabled", func() { + // Explicitly disable the NodeOverlay feature gate so this test does not + // depend on ordering with the previous It block that enables it. + ctx = coreoptions.ToContext(ctx, coretest.Options(coretest.OptionsFields{ + FeatureGates: coretest.FeatureGates{NodeOverlay: lo.ToPtr(false)}, + })) + customResource := v1.ResourceName("example.com/dongle") + overlayCapacity := resource.MustParse("100") + nodeClaim.Spec.Resources.Requests = v1.ResourceList{customResource: resource.MustParse("1")} + + ExpectApplied(ctx, env.Client, nodeClass, nodePool, nodeClaim) + reconcileCapacityOverlay(customResource, overlayCapacity) + + if testOptions.resetCreateCalls != nil { + testOptions.resetCreateCalls() + } + cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(cloudProviderMachine).To(BeNil()) + }) + }) +} + +func runUnhappyPathHandlingTests(provisionMode provisionModeTestCase) { Context("Unexpected API Failures", func() { - It("should handle AKS machine create failures - unrecognized error during sync/initial", func() { - // Set up error to occur immediately during BeginCreateOrUpdate call - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(fake.AKSMachineAPIErrorAny) + It("should handle create failures - unrecognized error during sync/initial", func() { + // Set up error to occur immediately during create. 
+ if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(fake.AKSMachineAPIErrorAny) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(fake.AKSMachineAPIErrorAny) + } ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - // Verify the create API was called but failed - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - - // Verify the cleanup was attempted - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) - - // Clear the error for cleanup - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(nil) + // Verify the create API was called and cleanup was attempted where applicable. + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(nil) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + } - // Verify the pod is now schedulable + // Verify provisioning works again after clearing the error. 
pod2 := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod2) ExpectScheduled(ctx, env.Client, pod2) }) - It("should handle AKS machine create failures - unrecognized error during async/LRO", func() { - // Set up error to occur during LRO polling (async failure) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAny() + It("should handle create failures - unrecognized error during async/LRO", func() { + // Set up error to occur during async provisioning. + if provisionMode.isAKSMachineMode() { + // WARNING: This fake currently surfaces through the immediate post-create GET, not the AKSMachine async poller. + // TODO: Make AfterPollProvisioningErrorOverride fail through the AKSMachine async poller path instead. + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAny() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + } ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // Verify the create API was called but failed - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - - // Verify the cleanup was attempted - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) - // Clear the error for cleanup - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + if provisionMode.isAKSMachineMode() { + ExpectNotScheduled(ctx, env.Client, pod) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) + 
azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + } else { + // Problem: an async failure doesn't affect schedulability here due to limitations of the core test framework. + // Machine API mode doesn't hit this because its simulated async failure is caught by the post-create GET instead, which hides the problem (details above). + //ExpectNotScheduled(ctx, env.Client, pod) + // Cleanup is invoked, but this fake async VM failure returns the poller error before storing a fake VM, so VM-provider Delete sees not found and no VM BeginDelete call is observable. + // TODO: Make the fake async VM failure store the VM before poll failure so this test can validate VM cleanup like Machine API validates DeleteMachines below. + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.Error.Set(nil) + } - // Verify the pod is now schedulable + // Verify provisioning works again after clearing the error. pod2 := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod2) ExpectScheduled(ctx, env.Client, pod2) }) - It("should handle AKS machine get failures - unrecognized error", func() { - // First create a successful AKS machine + It("should handle get failures - unrecognized error", func() { + // First create a successful nodeclaim. ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Get the created nodeclaim + // Get the created nodeclaim.
nodeClaims, err := cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) Expect(nodeClaims).To(HaveLen(1)) - validateAKSMachineNodeClaim(nodeClaims[0], nodePool) + provisionMode.validateNodeClaim(nodeClaims[0]) - // Set up Get to fail - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + // Set up get to fail. + provisionMode.resetGetCalls() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + } - // Attempt to get the nodeclaim - should fail + // Attempt to get the nodeclaim - should fail. retrievedNodeClaim, err := cloudProvider.Get(ctx, nodeClaims[0].Status.ProviderID) Expect(err).To(HaveOccurred()) Expect(retrievedNodeClaim).To(BeNil()) - // Verify the get API was called - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - // Clear the error for cleanup - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(nil) + // Verify the get API was called, then clear the error for cleanup. + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(nil) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.Error.Set(nil) + } }) - It("should handle AKS machine delete failures - unrecognized error during sync/initial", func() { - // First create a successful AKS machine + It("should handle delete failures - unrecognized error during sync/initial", func() { + // First create a successful nodeclaim. 
ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Get the created nodeclaim + // Get the created nodeclaim. nodeClaims, err := cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) Expect(nodeClaims).To(HaveLen(1)) - validateAKSMachineNodeClaim(nodeClaims[0], nodePool) + provisionMode.validateNodeClaim(nodeClaims[0]) - // Set up delete to fail - azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.BeginError.Set(fake.AKSMachineAPIErrorAny) + // Set up delete to fail. + provisionMode.resetDeleteCalls() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.BeginError.Set(fake.AKSMachineAPIErrorAny) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.BeginError.Set(fake.AKSMachineAPIErrorAny) + } - // Attempt to delete the nodeclaim - should fail + // Attempt to delete the nodeclaim - should fail. err = cloudProvider.Delete(ctx, nodeClaims[0]) Expect(err).To(HaveOccurred()) - // Verify the delete API was called - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) - // Clear the error for cleanup - azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.BeginError.Set(nil) + // Verify the delete API was called, then clear the error for cleanup. 
+ if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.BeginError.Set(nil) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.BeginError.Set(nil) + } }) - It("should handle AKS machine delete failures - unrecognized error during async/LRO", func() { - // First create a successful AKS machine + It("should handle delete failures - unrecognized error during async/LRO", func() { + // First create a successful nodeclaim. ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Get the created nodeclaim + // Get the created nodeclaim. nodeClaims, err := cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) Expect(nodeClaims).To(HaveLen(1)) - validateAKSMachineNodeClaim(nodeClaims[0], nodePool) + provisionMode.validateNodeClaim(nodeClaims[0]) - // Set up delete to fail - azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + // Set up delete to fail. + provisionMode.resetDeleteCalls() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + } - // Attempt to delete the nodeclaim - should fail + // Attempt to delete the nodeclaim - should fail. 
err = cloudProvider.Delete(ctx, nodeClaims[0]) Expect(err).To(HaveOccurred()) - // Verify the delete API was called - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) - // Clear the error for cleanup - azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.Error.Set(nil) + // Verify the delete API was called, then clear the error for cleanup. + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.Error.Set(nil) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.Error.Set(nil) + } }) - It("should handle AKS machine list failures - unrecognized error", func() { - if testOptions.ProvisionMode == consts.ProvisionModeAKSMachineAPIHeaderBatch { - // Under batch, List must work for PollUntilDone (provisioning) to complete. - // Testing "List fails" requires a batch-specific test that expects provisioning failure, - // not this test which assumes provisioning succeeds then List fails afterward. - Skip("Batch provisioning depends on List to work") + It("should handle list failures - unrecognized error", func() { + // Under batch, List must work for PollUntilDone (provisioning) to complete. + // Testing "List fails" requires a batch-specific test that expects provisioning failure, + // not this test which assumes provisioning succeeds then List fails afterward. + if provisionMode.isAKSMachineAPIHeaderBatchMode() { + Skip("header-batch mode lists AKS machines during provisioning") } - // Set up error to occur during the NextPage call - azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + // First create a successful nodeclaim. 
ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Verify the list API was called but failed - azureEnv.AKSAgentPoolsAPI.AgentPoolGetBehavior.CalledWithInput.Reset() + provisionMode.resetListCalls() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + } else { + azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.Error.Set(fake.AKSMachineAPIErrorAny) + } + nodeClaims, err := cloudProvider.List(ctx) Expect(err).To(HaveOccurred()) Expect(nodeClaims).To(BeEmpty()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - // Clear the error for cleanup - azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.Error.Set(nil) + // Verify the list API was called, then clear the error for cleanup. + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.Error.Set(nil) + } else { + Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.Error.Set(nil) + } - // Verify the pod is now schedulable + // Verify provisioning works again after clearing the error. pod2 := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod2) ExpectScheduled(ctx, env.Client, pod2) }) }) + // We currently don't support changing immutable offerings requirements for an already-created nodeclaim name. 
Context("Operation Conflicts/Races", func() { - It("should handle AKS machine get/delete failures - not found/already deleted externally", func() { - // First create a successful AKS machine + It("should handle get/delete failures - not found/already deleted externally", func() { ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) - // Get the created nodeclaim nodeClaims, err := cloudProvider.List(ctx) Expect(err).ToNot(HaveOccurred()) Expect(nodeClaims).To(HaveLen(1)) - validateAKSMachineNodeClaim(nodeClaims[0], nodePool) + provisionMode.validateNodeClaim(nodeClaims[0]) - // Delete the machine directly + // Delete the nodeclaim so the backing instance is gone. err = cloudProvider.Delete(ctx, nodeClaims[0]) Expect(err).ToNot(HaveOccurred()) - // Get should return NodeClaimNotFound error - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() + provisionMode.resetGetCalls() + // Get should return NodeClaimNotFound after the backing instance is gone. retrievedNodeClaim2, err := cloudProvider.Get(ctx, nodeClaims[0].Status.ProviderID) Expect(err).To(HaveOccurred()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) + provisionMode.expectGetCalls() Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) Expect(retrievedNodeClaim2).To(BeNil()) - // Delete should also return NodeClaimNotFound error - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() - azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Reset() + provisionMode.resetGetCalls() + provisionMode.resetDeleteCalls() + // Attempt to delete the nodeclaim again - should fail as not found. 
err = cloudProvider.Delete(ctx, nodeClaims[0]) Expect(err).To(HaveOccurred()) - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(0)) // Per current logic, get should be called before delete + provisionMode.expectGetCalls() + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(0)) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(0)) + } Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) - - // Clear the error for cleanup - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(nil) }) - // Note: currently, we do not support different offerings requirements for the NodeClaim with the same name that attempted creation recently. The same applies with VM-based provisioning. - It("should handle AKS machine create - found in get, with the same requirements", func() { - // Create a fresh nodeClaim with explicit requirements so we know exactly what it will have + It("should handle instance create - found in get, with the same requirements", func() { + // Create a fresh nodeclaim with explicit requirements so we know exactly what it will have. firstNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, @@ -345,59 +648,67 @@ func runAKSMachineAPIIntegrationTests() { }, }) - // First create a successful AKS machine using cloudProvider.Create directly + // First create a successful instance using cloudProvider.Create directly. 
ExpectApplied(ctx, env.Client, nodeClass, nodePool, firstNodeClaim) createdFirstNodeClaim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, firstNodeClaim) Expect(err).ToNot(HaveOccurred()) Expect(createdFirstNodeClaim).ToNot(BeNil()) - validateAKSMachineNodeClaim(createdFirstNodeClaim, nodePool) + provisionMode.validateNodeClaim(createdFirstNodeClaim) Expect(createdFirstNodeClaim.CreationTimestamp).ToNot(BeZero()) - // Create a conflicted nodeclaim with same configuration + // Create a conflicted nodeclaim with the same configuration. conflictedNodeClaim := firstNodeClaim.DeepCopy() - // Call cloudProvider.Create directly with the unconflicted nodeclaim to trigger get - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() - nodeClaim, err = CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, conflictedNodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(nodeClaim).ToNot(BeNil()) - - // Verify the AKS machine was reused successfully - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - // With cache enabled, the GET is served from cache rather than hitting the API directly. - if testOptions.ProvisionMode == consts.ProvisionModeAKSMachineAPIHeaderBatch { - // With cache enabled, the pre-create GET is served from cache — no API call recorded - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) + // Reset API call tracking before triggering the reuse path. 
+ if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() } else { - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() } - // Since no new machine was created, verify the machine in the fake store matches original config. - aksMachineName := firstNodeClaim.Name - - // Get the actual machine from the fake store - machineID := fake.MkMachineID(testOptions.NodeResourceGroup, testOptions.ClusterName, testOptions.AKSMachinesPoolName, aksMachineName) - existingMachine, ok := azureEnv.AKSDataStorage.AKSMachines.Load(machineID) - Expect(ok).To(BeTrue(), "AKS machine should exist in fake store") - aksMachine := existingMachine - Expect(aksMachine.Properties).ToNot(BeNil()) + // Call Create with the same-named nodeclaim to trigger get/reuse. + createdNodeClaim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, conflictedNodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(createdNodeClaim).ToNot(BeNil()) - // Validate AKS machine properties match the conflicted configuration - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - Expect(*aksMachine.Properties.Hardware.VMSize).To(Equal("Standard_D2_v2")) - Expect(aksMachine.Zones).To(HaveLen(1)) - Expect(*aksMachine.Zones[0]).To(Equal("1")) + // Verify the existing instance was reused successfully. + if provisionMode.isAKSMachineMode() { + // With cache enabled, the pre-create GET is served from cache, so no new create is recorded. 
+ Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) + // Since no new instance was created, verify the fake store still has the original configuration. + machineID := fake.MkMachineID(testOptions.NodeResourceGroup, testOptions.ClusterName, testOptions.AKSMachinesPoolName, firstNodeClaim.Name) + aksMachine, ok := azureEnv.AKSDataStorage.AKSMachines.Load(machineID) + Expect(ok).To(BeTrue(), "AKS machine should exist in fake store") + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + Expect(*aksMachine.Properties.Hardware.VMSize).To(Equal("Standard_D2_v2")) + Expect(aksMachine.Zones).To(HaveLen(1)) + Expect(*aksMachine.Zones[0]).To(Equal("1")) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) + vmName := instance.GenerateResourceName(firstNodeClaim.Name) + vm, ok := azureEnv.VirtualMachinesAPI.Instances.Load(fake.MkVMID(testOptions.NodeResourceGroup, vmName)) + Expect(ok).To(BeTrue(), "VM should exist in fake store") + Expect(vm.Properties).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile.VMSize).ToNot(BeNil()) + Expect(string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize))).To(Equal("Standard_D2_v2")) + Expect(vm.Zones).To(HaveLen(1)) + Expect(lo.FromPtr(vm.Zones[0])).To(Equal("1")) + } - // Validate the returned nodeClaim has correct configuration - validateAKSMachineNodeClaim(nodeClaim, nodePool) - Expect(nodeClaim.Labels[v1.LabelTopologyZone]).To(Equal(zones.MakeAKSLabelZoneFromARMZone(fake.Region, "1"))) - Expect(nodeClaim.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) + // Validate the returned nodeclaim has the expected configuration. 
+ provisionMode.validateNodeClaim(createdNodeClaim) + Expect(createdNodeClaim.Labels[v1.LabelTopologyZone]).To(Equal(zones.MakeAKSLabelZoneFromARMZone(fake.Region, "1"))) + Expect(createdNodeClaim.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) }) - It("should handle AKS machine create failures - not found in get, but somehow found during create, although with same configuration", func() { - // Create a fresh nodeClaim with explicit requirements so we know exactly what it will have + It("should handle instance create - not found in get, but found during create with the same requirements", func() { + // Create a fresh nodeclaim with explicit requirements so we know exactly what it will have. firstNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, @@ -423,48 +734,70 @@ func runAKSMachineAPIIntegrationTests() { }, }) - // First create a successful AKS machine using cloudProvider.Create directly + // First create a successful instance using cloudProvider.Create directly. ExpectApplied(ctx, env.Client, nodeClass, nodePool, firstNodeClaim) createdFirstNodeClaim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, firstNodeClaim) Expect(err).ToNot(HaveOccurred()) Expect(createdFirstNodeClaim).ToNot(BeNil()) - validateAKSMachineNodeClaim(createdFirstNodeClaim, nodePool) + provisionMode.validateNodeClaim(createdFirstNodeClaim) Expect(createdFirstNodeClaim.CreationTimestamp).ToNot(BeZero()) - // Create a conflicted nodeclaim with same configuration + // Create a conflicted nodeclaim with the same configuration. 
conflictedNodeClaim := firstNodeClaim.DeepCopy() - // Simulate Get being faulty (or the previous machine comes into exist between get and create) - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(fake.AKSMachineAPIErrorFromAKSMachineNotFound) - azureEnv.AKSMachineCache.InvalidateAll() // Ensure the cache doesn't serve stale data + // Simulate get missing the existing instance before create finds the same configuration. + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(fake.AKSMachineAPIErrorFromAKSMachineNotFound) + azureEnv.AKSMachineCache.InvalidateAll() + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + vmName := instance.GenerateResourceName(firstNodeClaim.Name) + vm, ok := azureEnv.VirtualMachinesAPI.Instances.Load(fake.MkVMID(testOptions.NodeResourceGroup, vmName)) + Expect(ok).To(BeTrue(), "VM should exist in fake store") + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.Output.Set(&armcompute.VirtualMachinesClientCreateOrUpdateResponse{VirtualMachine: vm}) + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.Error.Set(&azcore.ResponseError{StatusCode: http.StatusNotFound}) + } - // Call cloudProvider.Create directly with the unconflicted nodeclaim to trigger empty create - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() - nodeClaim, err = CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, conflictedNodeClaim) + // Call Create with the same-named nodeclaim to trigger create-after-get-miss handling. 
+ createdNodeClaim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, conflictedNodeClaim) Expect(err).ToNot(HaveOccurred()) - Expect(nodeClaim).ToNot(BeNil()) - - // Verify the AKS machine was created successfully - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties).ToNot(BeNil()) - - // Validate AKS machine properties match the conflicted configuration - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - Expect(*aksMachine.Properties.Hardware.VMSize).To(Equal("Standard_D2_v2")) - Expect(aksMachine.Zones).To(HaveLen(1)) - Expect(*aksMachine.Zones[0]).To(Equal("1")) + Expect(createdNodeClaim).ToNot(BeNil()) + + // Verify the instance was created successfully. + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + Expect(*aksMachine.Properties.Hardware.VMSize).To(Equal("Standard_D2_v2")) + Expect(aksMachine.Zones).To(HaveLen(1)) + Expect(*aksMachine.Zones[0]).To(Equal("1")) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.Output.Set(nil) + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.Error.Set(nil) + vmName := instance.GenerateResourceName(firstNodeClaim.Name) + vm, ok := 
azureEnv.VirtualMachinesAPI.Instances.Load(fake.MkVMID(testOptions.NodeResourceGroup, vmName)) + Expect(ok).To(BeTrue(), "VM should exist in fake store") + Expect(vm.Properties).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile).ToNot(BeNil()) + Expect(vm.Properties.HardwareProfile.VMSize).ToNot(BeNil()) + Expect(string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize))).To(Equal("Standard_D2_v2")) + Expect(vm.Zones).To(HaveLen(1)) + Expect(lo.FromPtr(vm.Zones[0])).To(Equal("1")) + } - // Validate the returned nodeClaim has correct configuration - validateAKSMachineNodeClaim(nodeClaim, nodePool) - Expect(nodeClaim.Labels[v1.LabelTopologyZone]).To(Equal(zones.MakeAKSLabelZoneFromARMZone(fake.Region, "1"))) - Expect(nodeClaim.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) + // Validate the returned nodeclaim has the expected configuration. + provisionMode.validateNodeClaim(createdNodeClaim) + Expect(createdNodeClaim.Labels[v1.LabelTopologyZone]).To(Equal(zones.MakeAKSLabelZoneFromARMZone(fake.Region, "1"))) + Expect(createdNodeClaim.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) }) - It("should handle AKS machine create failures - not found in get, but somehow found during create, although with conflicted configuration", func() { - // Create a fresh nodeClaim with explicit requirements so we know exactly what it will have + It("should handle instance create - not found in get, but found during create with conflicted requirements", func() { + // Create a fresh nodeclaim with explicit requirements so we know exactly what it will have. firstNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, @@ -490,74 +823,189 @@ func runAKSMachineAPIIntegrationTests() { }, }) - // First create a successful AKS machine using cloudProvider.Create directly + // First create a successful instance using cloudProvider.Create directly. 
ExpectApplied(ctx, env.Client, nodeClass, nodePool, firstNodeClaim) createdFirstNodeClaim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, firstNodeClaim) Expect(err).ToNot(HaveOccurred()) Expect(createdFirstNodeClaim).ToNot(BeNil()) - validateAKSMachineNodeClaim(createdFirstNodeClaim, nodePool) + provisionMode.validateNodeClaim(createdFirstNodeClaim) Expect(createdFirstNodeClaim.CreationTimestamp).ToNot(BeZero()) - // Create a conflicted nodeclaim with different immutable configuration (zone/SKU) - conflictedNodeClaim := firstNodeClaim.DeepCopy() - // Change zone to create immutable configuration conflict - conflictedNodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - { - Key: v1.LabelTopologyZone, - Operator: v1.NodeSelectorOpIn, - Values: []string{zones.MakeAKSLabelZoneFromARMZone(fake.Region, "2")}, // Different zone, + expectedConflictedVMSize := "Standard_D2_v2" + + // Create a conflicted nodeclaim with different immutable configuration. + // Change zone to create the immutable configuration conflict. 
+ conflictedNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: firstNodeClaim.Name, + Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, }, - { - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v5"}, // Different SKU, + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, + }, + Requirements: []karpv1.NodeSelectorRequirementWithMinValues{ + { + Key: v1.LabelTopologyZone, + Operator: v1.NodeSelectorOpIn, + Values: []string{zones.MakeAKSLabelZoneFromARMZone(fake.Region, "2")}, + }, + { + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{expectedConflictedVMSize}, + }, + }, }, - } - - // Simulate Get being faulty (or the previous machine comes into exist between get and create) - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(fake.AKSMachineAPIErrorFromAKSMachineNotFound) - azureEnv.AKSMachineCache.InvalidateAll() // Ensure the cache doesn't serve stale data + }) - // Call cloudProvider.Create directly with the conflicted nodeclaim to trigger the race condition - // This targets the same machine name but should fail due to configuration conflict and trigger cleanup - _, err = CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, conflictedNodeClaim) - Expect(err).To(HaveOccurred()) + // Simulate get missing the existing instance before create hits conflicting requirements. 
+ if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(fake.AKSMachineAPIErrorFromAKSMachineNotFound) + azureEnv.AKSMachineCache.InvalidateAll() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.Error.Set(&azcore.ResponseError{StatusCode: http.StatusNotFound}) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Reset() + } - // Verify cleanup was attempted after the conflict - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + // Call Create with the conflicted nodeclaim to trigger cleanup of the conflicting instance. + if provisionMode.isAKSMachineMode() { + _, err = CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, conflictedNodeClaim) + Expect(err).To(HaveOccurred()) + } else { + freshNodeClaim := &karpv1.NodeClaim{} + Expect(env.Client.Get(ctx, client.ObjectKey{Name: conflictedNodeClaim.Name}, freshNodeClaim)).To(Succeed()) + freshNodeClaim.StatusConditions().SetTrue(karpv1.ConditionTypeLaunched) + Expect(env.Client.Status().Update(ctx, freshNodeClaim)).To(Succeed()) + _, err = cloudProvider.Create(ctx, conflictedNodeClaim) + Expect(err).ToNot(HaveOccurred()) + cloudProvider.WaitForInstancePromises() + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + conflictCreateInput := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(conflictCreateInput.VM.Properties).ToNot(BeNil()) + Expect(conflictCreateInput.VM.Properties.HardwareProfile).ToNot(BeNil()) + Expect(conflictCreateInput.VM.Properties.HardwareProfile.VMSize).ToNot(BeNil()) + Expect(string(lo.FromPtr(conflictCreateInput.VM.Properties.HardwareProfile.VMSize))).To(Equal(expectedConflictedVMSize)) + Expect(conflictCreateInput.VM.Zones).To(HaveLen(1)) + 
Expect(lo.FromPtr(conflictCreateInput.VM.Zones[0])).To(Equal("2")) + } - // Clear the error - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(nil) + // Verify cleanup was attempted after the conflict, then clear the injected get error. + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.Error.Set(nil) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + deleteInput := azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Pop() + Expect(deleteInput.VMName).To(Equal(instance.GenerateResourceName(conflictedNodeClaim.Name))) + azureEnv.VirtualMachinesAPI.Instances.Delete(fake.MkVMID(deleteInput.ResourceGroupName, deleteInput.VMName)) + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.Error.Set(nil) + } - // Should succeed now that the conflicted node is gone from the cleanup - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + } + // Retry should succeed after the conflicting instance is gone. 
createdConflictedNodeClaim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, conflictedNodeClaim) Expect(err).ToNot(HaveOccurred()) Expect(createdConflictedNodeClaim).ToNot(BeNil()) - // Verify the AKS machine was created successfully - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - aksMachine := createInput.AKSMachine - Expect(aksMachine.Properties).ToNot(BeNil()) - - // Validate AKS machine properties match the conflicted configuration - Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) - Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) - Expect(*aksMachine.Properties.Hardware.VMSize).To(Equal("Standard_D2_v5")) - Expect(aksMachine.Zones).To(HaveLen(1)) - Expect(*aksMachine.Zones[0]).To(Equal("2")) - - // Validate the returned nodeClaim has correct configuration - validateAKSMachineNodeClaim(createdConflictedNodeClaim, nodePool) - Expect(createdConflictedNodeClaim.Labels[v1.LabelTopologyZone]).To(Equal(zones.MakeAKSLabelZoneFromARMZone(fake.Region, "2"))) - Expect(createdConflictedNodeClaim.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v5")) + // Verify the instance was created successfully with the conflicted configuration. 
+ if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + aksMachine := createInput.AKSMachine + Expect(aksMachine.Properties).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware).ToNot(BeNil()) + Expect(aksMachine.Properties.Hardware.VMSize).ToNot(BeNil()) + Expect(*aksMachine.Properties.Hardware.VMSize).To(Equal(expectedConflictedVMSize)) + Expect(aksMachine.Zones).To(HaveLen(1)) + Expect(*aksMachine.Zones[0]).To(Equal("2")) + Expect(createdConflictedNodeClaim.Labels[v1.LabelTopologyZone]).To(Equal(zones.MakeAKSLabelZoneFromARMZone(fake.Region, "2"))) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + createInput := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(createInput.VM.Properties).ToNot(BeNil()) + Expect(createInput.VM.Properties.HardwareProfile).ToNot(BeNil()) + Expect(createInput.VM.Properties.HardwareProfile.VMSize).ToNot(BeNil()) + Expect(string(lo.FromPtr(createInput.VM.Properties.HardwareProfile.VMSize))).To(Equal(expectedConflictedVMSize)) + Expect(createInput.VM.Zones).To(HaveLen(1)) + retriedZoneID := lo.FromPtr(createInput.VM.Zones[0]) + Expect(retriedZoneID).ToNot(BeEmpty()) + vm, ok := azureEnv.VirtualMachinesAPI.Instances.Load(fake.MkVMID(createInput.ResourceGroupName, createInput.VMName)) + Expect(ok).To(BeTrue(), "VM should exist in fake store") + Expect(string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize))).To(Equal(expectedConflictedVMSize)) + Expect(createdConflictedNodeClaim.Labels[v1.LabelTopologyZone]).To(Equal(zones.MakeAKSLabelZoneFromARMZone(fake.Region, retriedZoneID))) + } + + // Validate the returned nodeclaim has the expected configuration. 
+ provisionMode.validateNodeClaim(createdConflictedNodeClaim) + Expect(createdConflictedNodeClaim.Labels[v1.LabelInstanceTypeStable]).To(Equal(expectedConflictedVMSize)) Expect(createdConflictedNodeClaim.CreationTimestamp).ToNot(BeZero()) }) }) } var _ = Describe("CloudProvider", func() { + Context("ProvisionMode = AKSScriptless, ManageExistingAKSMachines = false", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSScriptless), + ManageExistingAKSMachines: lo.ToPtr(false), + }) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) + + azureEnv = test.NewEnvironment(ctx, env) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + }) + + AfterEach(func() { + // Wait for any async polling goroutines to complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + }) + + runIntegrationTests(aksscriptlessProvisionMode()) + }) + + Context("ProvisionMode = AKSScriptless, ManageExistingAKSMachines = true", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSScriptless), + ManageExistingAKSMachines: lo.ToPtr(true), + }) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) + + azureEnv = test.NewEnvironment(ctx, env) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, 
azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + }) + + AfterEach(func() { + // Wait for any async polling goroutines to complete before resetting + cloudProvider.WaitForInstancePromises() + cluster.Reset() + azureEnv.Reset(ctx) + }) + + runIntegrationTests(aksscriptlessProvisionMode()) + }) + Context("ProvisionMode = AKSMachineAPIHeaderBatch, ManageExistingAKSMachines = false", func() { BeforeEach(func() { testOptions = test.Options(test.OptionsFields{ @@ -593,9 +1041,7 @@ var _ = Describe("CloudProvider", func() { azureEnvNonZonal.Reset(ctx) }) - // Run shared provision-mode tests - runSharedProvisionModeIntegrationTests(aksMachineAPIHeaderBatchProvisionMode()) - runAKSMachineAPIIntegrationTests() + runIntegrationTests(aksMachineAPIHeaderBatchProvisionMode()) }) Context("ProvisionMode = AKSMachineAPIHeaderBatch, ManageExistingAKSMachines = true", func() { @@ -633,9 +1079,7 @@ var _ = Describe("CloudProvider", func() { azureEnvNonZonal.Reset(ctx) }) - // Run shared provision-mode tests - runSharedProvisionModeIntegrationTests(aksMachineAPIHeaderBatchProvisionMode()) - runAKSMachineAPIIntegrationTests() + runIntegrationTests(aksMachineAPIHeaderBatchProvisionMode()) }) Context("Mixed Environment - Migration from ProvisionMode = AKSMachineAPIHeaderBatch to VM mode", func() { diff --git a/pkg/cloudprovider/suite_modes_test.go b/pkg/cloudprovider/suite_modes_test.go deleted file mode 100644 index c0a4fa201..000000000 --- a/pkg/cloudprovider/suite_modes_test.go +++ /dev/null @@ -1,81 +0,0 @@ -/* -Portions Copyright (c) Microsoft Corporation. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package cloudprovider - -import ( - . "github.com/onsi/gomega" - karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" -) - -type provisionModeTestCase struct { - name string - validateNodeClaim func(*karpv1.NodeClaim) - resetCreateCalls func() - expectCreateCalls func() - expectCreatedResource func() - resetListCalls func() - expectListCalls func() - resetGetCalls func() - expectGetCalls func() - resetDeleteCalls func() - expectDeleteCalls func() -} - -func aksMachineAPIHeaderBatchProvisionMode() provisionModeTestCase { - return provisionModeTestCase{ - name: "AKSMachineAPIHeaderBatch", - validateNodeClaim: func(nodeClaim *karpv1.NodeClaim) { - validateAKSMachineNodeClaim(nodeClaim, nodePool) - }, - resetCreateCalls: func() { - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() - }, - expectCreateCalls: func() { - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - }, - expectCreatedResource: func() { - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(createInput.AKSMachine.Properties).ToNot(BeNil()) - }, - resetListCalls: func() { - azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Reset() - azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Reset() - }, - expectListCalls: func() { - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) - }, - resetGetCalls: func() { - azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() - }, - expectGetCalls: func() { - Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) - }, - resetDeleteCalls: func() { - azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Reset() - azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Reset() - }, - expectDeleteCalls: func() { - Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(0)) - }, - } -} diff --git a/pkg/cloudprovider/suite_offerings_test.go b/pkg/cloudprovider/suite_offerings_test.go index 4c58eda2a..51dd19750 100644 --- a/pkg/cloudprovider/suite_offerings_test.go +++ b/pkg/cloudprovider/suite_offerings_test.go @@ -18,9 +18,10 @@ package cloudprovider import ( "fmt" + "net/http" + sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" "github.com/awslabs/operatorpkg/object" - corestatus "github.com/awslabs/operatorpkg/status" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "github.com/samber/lo" @@ -53,11 +54,10 @@ import ( "github.com/Azure/skewer" ) -func runAKSMachineAPIOfferingTests() { +func runOfferingTests(provisionMode provisionModeTestCase) { Context("Create - Expected Creation Failures", func() { - // Ported from VM test: "should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed" It("should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed", func() { - // Configure NodePool to allow both spot and on-demand + lowPriorityCoresQuotaErrorMessage := "Operation could not be completed as it results in exceeding approved Low Priority Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22LowPriorityCores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:32,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22LowPriorityCores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. 
Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, @@ -65,72 +65,108 @@ func runAKSMachineAPIOfferingTests() { }) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set up async error - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorLowPriorityCoresQuota(fake.Region) + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorLowPriorityCoresQuota(fake.Region) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, lowPriorityCoresQuotaErrorMessage), + }, + }, + ) + } pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - - // Verify spot capacity type marked as unavailable due to quota error - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) - Expect(*createInput.AKSMachine.Properties.Priority).To(Equal(armcontainerservice.ScaleSetPrioritySpot)) - testSKU := fake.MakeSKU(vmSize) - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) - // Clear both error and output for retry - should succeed with on-demand - 
azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) + Expect(*createInput.AKSMachine.Properties.Priority).To(Equal(armcontainerservice.ScaleSetPrioritySpot)) + testSKU := fake.MakeSKU(vmSize) + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) + } + + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + } ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) node := ExpectScheduled(ctx, env.Client, pod) Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) - // Verify final node count nodes, err := env.KubernetesInterface.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) Expect(err).ToNot(HaveOccurred()) Expect(len(nodes.Items)).To(Equal(1)) Expect(nodes.Items[0].Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) }) - // Ported from VM test: "should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed" It("should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed", func() { + overconstrainedZonalAllocationErrorMessage := "Allocation failed. VM(s) with the following constraints cannot be allocated, because the condition is too restrictive. Please remove some constraints and try again." 
coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}}) + Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}, + }) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedZonalAllocation() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedZonalAllocation() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OverconstrainedZonalAllocationRequest, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OverconstrainedZonalAllocationRequest, overconstrainedZonalAllocationErrorMessage), + }, + }, + ) + } pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - // Verify the create API was called but failed due to zonal allocation constraint - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - initialZone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + initialZone, err := 
instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) + testSKU := fake.MakeSKU(vmSize) + ExpectUnavailable(azureEnv, testSKU, initialZone, karpv1.CapacityTypeSpot) - // Verify initial zone marked as unavailable due to zonal allocation failure - vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) - testSKU := fake.MakeSKU(vmSize) - ExpectUnavailable(azureEnv, testSKU, initialZone, karpv1.CapacityTypeSpot) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(initialZone)) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + initialVMSize := string(*vm.Properties.HardwareProfile.VMSize) + initialCapacityType := instance.GetCapacityTypeFromVM(&vm) + zone, err := zones.MakeAKSLabelZoneFromVM(&vm) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) - // Clear the error and retry - should succeed with different zone - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(initialZone)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) + 
Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(initialCapacityType)) + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(zone)) + Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) + } }) - // Ported from VM test: "should fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed" It("should fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed", func() { - // Configure NodePool to allow multiple capacity types + overconstrainedAllocationErrorMessage := "Allocation failed. VM(s) with the following constraints cannot be allocated, because the condition is too restrictive." coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ Key: karpv1.CapacityTypeLabelKey, @@ -145,36 +181,55 @@ func runAKSMachineAPIOfferingTests() { ) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedAllocation() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorOverconstrainedAllocation() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OverconstrainedAllocationRequest, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OverconstrainedAllocationRequest, overconstrainedAllocationErrorMessage), + }, + }, + ) + } pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - // Verify the create API was called but failed due to overconstrained allocation - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) + testSKU := fake.MakeSKU(vmSize) + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) - // Verify spot capacity type marked as unavailable due to allocation error - createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - vmSize := lo.FromPtr(createInput.AKSMachine.Properties.Hardware.VMSize) - testSKU := fake.MakeSKU(vmSize) - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&createInput.AKSMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, testSKU, zone, karpv1.CapacityTypeSpot) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) + Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + initialVMSize := string(*vm.Properties.HardwareProfile.VMSize) + initialCapacityType := instance.GetCapacityTypeFromVM(&vm) + _, err := zones.MakeAKSLabelZoneFromVM(&vm) + Expect(err).ToNot(HaveOccurred()) - // Clear both error and output for retry - should succeed with on-demand because - // this test constrains the 
NodePool to zonal placement. - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) - Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) + Expect(node.Labels[karpv1.CapacityTypeLabelKey]).ToNot(Equal(initialCapacityType)) + Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) + Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) + } }) - // Ported from VM test: "should fail to provision when AllocationFailure errors are hit, then switch placement and succeed" It("should fail to provision when AllocationFailure errors are hit, then switch placement and succeed", func() { - // Configure NodePool to allow multiple instance types coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, @@ -182,32 +237,51 @@ func runAKSMachineAPIOfferingTests() { }) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + } else { + 
azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.AllocationFailed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.AllocationFailed, "Allocation failed. We do not have sufficient capacity for the requested VM size in this region."), + }, + }, + ) + } pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - // Verify the create API was called but failed due to allocation failure - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - initialVMSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + initialVMSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) - // Verify initial VM size marked as unavailable due to allocation failure - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + 
Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) + Expect(node.Labels[v1.LabelTopologyZone]).To(Equal(zones.Regional)) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + initialVMSize := string(*vm.Properties.HardwareProfile.VMSize) + zone, err := zones.MakeAKSLabelZoneFromVM(&vm) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) - // Clear the error and retry - should succeed with the same VM size placed regionally - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) - Expect(node.Labels[v1.LabelTopologyZone]).To(Equal(zones.Regional)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) + Expect(node.Labels[v1.LabelTopologyZone]).To(Equal(zones.Regional)) + } }) - // Ported from VM test: "should fail to provision when AllocationFailure errors are hit and regional placement is unavailable" It("should fail to provision when AllocationFailure errors are hit and regional placement is unavailable", func() { coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1.LabelInstanceTypeStable, @@ -219,107 +293,147 @@ func runAKSMachineAPIOfferingTests() { azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - 
azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.AllocationFailed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.AllocationFailed, "Allocation failed. We do not have sufficient capacity for the requested VM size in this region."), + }, + }, + ) + } pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeOnDemand) - ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeOnDemand) - - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - }) - - It("should fail to provision when AllocationFailure errors are hit and all placements for the VM size are unavailable, then switch VM size and succeed", func() { - coretest.ReplaceRequirements(nodePool, 
karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v3", "Standard_D64s_v3"}, - }) - sku := fake.MakeSKU("Standard_D2_v3") - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeOnDemand) + ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeOnDemand) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - initialVMSize := lo.FromPtr(aksMachine.Properties.Hardware.VMSize) - zone, err := instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, 
karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeOnDemand) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, karpv1.CapacityTypeOnDemand) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + zone, err := zones.MakeAKSLabelZoneFromVM(&vm) + Expect(err).ToNot(HaveOccurred()) + ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeOnDemand) + ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeOnDemand) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).ToNot(Equal(initialVMSize)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) + } }) - // Ported from VM test: "should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone" It("should fail to provision when VM SKU family vCPU quota 
exceeded error is returned, and succeed when it is gone", func() { + familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 24, 24, 8, 32) + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 24, 24, 8, 32) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaExceededErrorMessage), + }, + }, + ) + } pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, 
cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - // Verify the create API was called but failed due to family quota - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + } else { + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(lo.FromPtr(nic.Interface.ID)) + Expect(ok).To(Equal(false)) + + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + } - // Clear the error and retry - should succeed - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) }) - // Ported from VM test: "should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone" It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() { + familyVCPUQuotaIsZeroErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. 
Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 0, 0, 8, 8) + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorVMFamilyQuotaExceeded("westus2", "Standard NCASv3_T4", 0, 0, 8, 8) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaIsZeroErrorMessage), + }, + }, + ) + } pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - // Verify the create API was called but failed due to zero quota limit - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + if provisionMode.isAKSMachineMode() { + 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + } else { + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(lo.FromPtr(nic.Interface.ID)) + Expect(ok).To(Equal(false)) + + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + } - // Clear the error and retry - should succeed - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) ExpectScheduled(ctx, env.Client, pod) }) - // Ported from VM test: Total Regional Cores quota test pattern It("should return ICE if Total Regional Cores Quota errors are hit", func() { - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorTotalRegionalCoresQuota(fake.Region) + regionalVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. 
Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests" + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorTotalRegionalCoresQuota(fake.Region) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalVCPUQuotaExceededErrorMessage), + }, + }, + ) + } - // Create nodeClaim directly and call cloudProvider.Create like VM tests - testNodeClaim1 := coretest.NodeClaim(karpv1.NodeClaim{ + testNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ karpv1.NodePoolLabelKey: nodePool.Name, @@ -333,19 +447,138 @@ func runAKSMachineAPIOfferingTests() { }, }, }) + if provisionMode.isAKSMachineMode() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim) + } else { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + } - ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim1) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim1) + claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim) 
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) Expect(claim).To(BeNil()) }) + + It("should fail to provision when AllocationFailure errors are hit and all placements for the VM size are unavailable, then switch VM size and succeed", func() { + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v3", "Standard_D64s_v3"}, + }) + sku := fake.MakeSKU("Standard_D2_v3") + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAllocationFailed() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.AllocationFailed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.AllocationFailed, "Allocation failed. 
We do not have sufficient capacity for the requested VM size in this region."), + }, + }, + ) + } + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + var initialVMSize string + var zone string + var err error + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + initialVMSize = lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + zone, err = instance.GetAKSLabelZoneFromAKSMachine(&aksMachine, fake.Region) + Expect(err).ToNot(HaveOccurred()) + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + initialVMSize = string(*vm.Properties.HardwareProfile.VMSize) + zone, err = zones.MakeAKSLabelZoneFromVM(&vm) + Expect(err).ToNot(HaveOccurred()) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + } + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeOnDemand) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, karpv1.CapacityTypeSpot) + ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zones.Regional, karpv1.CapacityTypeOnDemand) + + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1.LabelInstanceTypeStable]).ToNot(Equal(initialVMSize)) + }) }) - // Ported from VM test: "Zone-aware provisioning" Context("Create - Zone-aware provisioning", func() { - // Ported from VM test: "should launch in 
the NodePool-requested zone" + It("should prefer zonal placement for zone-capable instance types by default", func() { + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC24ads_A100_v4"}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{karpv1.CapacityTypeOnDemand}}, + ) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(zones.Regional)) + + if provisionMode.isAKSMachineMode() { + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Zones).ToNot(BeEmpty()) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Zones).ToNot(BeEmpty()) + } + }) + + It("should launch zone-capable instance types regionally when placement scope requires it", func() { + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC24ads_A100_v4"}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{karpv1.CapacityTypeOnDemand}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1beta1.LabelPlacementScope, + Operator: v1.NodeSelectorOpIn, + Values: []string{v1beta1.PlacementScopeRegional}}, + ) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + + pod := 
coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zones.Regional)) + Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeRegional)) + + if provisionMode.isAKSMachineMode() { + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Zones).To(BeEmpty()) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Zones).To(BeEmpty()) + } + }) + It("should launch in the NodePool-requested zone", func() { - zone, aksMachineZone := fmt.Sprintf("%s-3", fake.Region), "3" + zone, createZone := fmt.Sprintf("%s-3", fake.Region), "3" nodePool.Spec.Template.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ {Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeSpot, karpv1.CapacityTypeOnDemand}}, {Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{zone}}, @@ -356,30 +589,40 @@ func runAKSMachineAPIOfferingTests() { node := ExpectScheduled(ctx, env.Client, pod) Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zone)) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine).NotTo(BeNil()) - Expect(aksMachine.Zones).To(ConsistOf(&aksMachineZone)) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine).NotTo(BeNil()) + 
Expect(aksMachine.Zones).To(ConsistOf(&createZone)) + } else { + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm).NotTo(BeNil()) + Expect(vm.Zones).To(ConsistOf(&createZone)) + } }) - // Ported from VM test: "should support provisioning in non-zonal regions" It("should support provisioning in non-zonal regions", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Zones).To(BeEmpty()) + if provisionMode.isAKSMachineMode() { + Expect(azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnvNonZonal.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Zones).To(BeEmpty()) + } else { + Expect(azureEnvNonZonal.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnvNonZonal.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Zones).To(BeEmpty()) + } }) - // Ported from VM test: "should support provisioning non-zonal instance types in zonal regions" It("should support provisioning non-zonal instance types in zonal regions", func() { coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC6s_v3"}, // Non-zonal instance type + Values: []string{"Standard_NC6s_v3"}, }) ExpectApplied(ctx, env.Client, nodePool, nodeClass) @@ -389,190 
+632,81 @@ func runAKSMachineAPIOfferingTests() { node := ExpectScheduled(ctx, env.Client, pod) Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zones.Regional)) - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine - Expect(aksMachine.Zones).To(BeEmpty()) - }) - }) - - // Ported from VM test: "CloudProvider Create Error Cases" - Context("Create - CloudProvider Create Error Cases", func() { - // Ported from VM test: "should return an ICE error when there are no instance types to launch" - // But, from cloudprovider/suite_test.go rather than instancetype/suite_test.go - It("should return an ICE error when there are no instance types to launch", func() { - // Specify no instance types and expect to receive a capacity error - nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - { - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"doesnotexist"}, // will not match any instance types, - }, + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + Expect(aksMachine.Zones).To(BeEmpty()) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + Expect(vm.Zones).To(BeEmpty()) } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) - cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) - Expect(cloudProviderMachine).To(BeNil()) }) - // Ported from VM test: "should return 
error when NodeClass readiness is Unknown" - It("should return error when NodeClass readiness is Unknown", func() { - nodeClass.StatusConditions().SetUnknown(corestatus.ConditionReady) - testNodeClaim2 := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, - }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, + It("should schedule pods with zonal topology spread when non-zonal SKUs exist", func() { + podLabels := map[string]string{"app": "tsc-repro"} + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pods := []*v1.Pod{} + for i := 0; i < 3; i++ { + pods = append(pods, coretest.UnschedulablePod(coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: podLabels}, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelTopologyZone, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: podLabels}, + }, }, - }, - }) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, testNodeClaim2) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim2) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("resolving NodeClass readiness, NodeClass is in Ready=Unknown")) + })) + } + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) 
+ for _, pod := range pods { + ExpectScheduled(ctx, env.Client, pod) + } }) - // Ported from VM test: "should return error when instance type resolution fails" - It("should return error when instance type resolution fails", func() { - // Create and set up the status controller - localStatusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) - - // Set NodeClass to Ready - nodeClass.StatusConditions().SetTrue(karpv1.ConditionTypeLaunched) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Reconcile the NodeClass to ensure status is updated - ExpectObjectReconciled(ctx, env.Client, localStatusController, nodeClass) - - // Flush the cache to simulate the controller not having run yet. - // With the instance type controller, SKU API errors happen during - // UpdateInstanceTypes (controller reconcile), not during List. - // When the cache is empty, List returns an error. 
- azureEnv.InstanceTypesProvider.Reset() - - testNodeClaim3 := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, - }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, + It("should exclude non-zonal instance types via zone NodePool requirements", func() { + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC6s_v3"}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelTopologyZone, + Operator: v1.NodeSelectorOpIn, + Values: []string{fakeZone1}, }, - }) - - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim3) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("resolving instance types")) + ) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Reset instance types - Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) }) - // Ported from VM test: "should return error when instance creation fails" - It("should return error when instance creation fails", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Create a NodeClaim with valid requirements - testNodeClaim4 := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, - }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - 
Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, + It("should exclude non-zonal instance types when all real zones are specified", func() { + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC6s_v3"}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelTopologyZone, + Operator: v1.NodeSelectorOpIn, + Values: azureEnv.Zones(), }, - }) - - // Set up the AKS machine provider to fail (different from VM API) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorAny() + ) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim4) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("creating AKS machine failed")) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) }) }) - // Mostly ported from VM test: "Provider list" - Context("Create - Provider list", func() { - // Ported from VM test: "should support individual instance type labels" - // TODO(mattchr): rework this from VM test (new additions) - // It("should support individual instance type labels", func() { - // ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // nodeSelector := map[string]string{ - // // Well known - // v1.LabelTopologyRegion: fake.Region, - // karpv1.NodePoolLabelKey: nodePool.Name, - // v1.LabelTopologyZone: fakeZone1, - // v1.LabelInstanceTypeStable: "Standard_NC24ads_A100_v4", - // v1.LabelOSStable: "linux", - // v1.LabelArchStable: "amd64", - // karpv1.CapacityTypeLabelKey: "on-demand", - // // Well Known to AKS - // 
v1beta1.LabelSKUName: "Standard_NC24ads_A100_v4", - // v1beta1.LabelSKUFamily: "N", - // v1beta1.LabelSKUVersion: "4", - // v1beta1.LabelSKUStorageEphemeralOSMaxSize: "429", - // v1beta1.LabelSKUAcceleratedNetworking: "true", - // v1beta1.LabelSKUStoragePremiumCapable: "true", - // v1beta1.LabelSKUGPUName: "A100", - // v1beta1.LabelSKUGPUManufacturer: "nvidia", - // v1beta1.LabelSKUGPUCount: "1", - // v1beta1.LabelSKUCPU: "24", - // v1beta1.LabelSKUMemory: "8192", - // // Deprecated Labels - // v1.LabelFailureDomainBetaRegion: fake.Region, - // v1.LabelFailureDomainBetaZone: fakeZone1, - // "beta.kubernetes.io/arch": "amd64", - // "beta.kubernetes.io/os": "linux", - // v1.LabelInstanceType: "Standard_NC24ads_A100_v4", - // "topology.disk.csi.azure.com/zone": fakeZone1, - // v1.LabelWindowsBuild: "window", - // // Cluster Label - // v1beta1.AKSLabelCluster: "test-cluster", - // } - - // // Ensure that we're exercising all well known labels - // Expect(lo.Keys(nodeSelector)).To(ContainElements(append(karpv1.WellKnownLabels.UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...))) - - // var pods []*v1.Pod - // for key, value := range nodeSelector { - // pods = append(pods, coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{key: value}})) - // } - // ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) 
- // for _, pod := range pods { - // ExpectScheduled(ctx, env.Client, pod) - // } - // }) - }) - - // Ported from VM test: "Unavailable Offerings" Context("Create - Unavailable Offerings", func() { - // Ported from VM test: "should not allocate a vm in a zone marked as unavailable" - It("should not allocate an AKS machine in a zone marked as unavailable", func() { + It("should not allocate an instance in a zone marked as unavailable", func() { azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ @@ -588,23 +722,48 @@ func runAKSMachineAPIOfferingTests() { Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) }) - // Ported from VM test: "should handle ZonalAllocationFailed on creating the VM" - It("should handle ZonalAllocationFailed on creating the AKS machine", func() { - // Set up async error via BOTH Error and Output (LRO returns both) - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorZoneAllocationFailed("Standard_D2_v2", "1") - - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v2"}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + It("should list nodeclaim with correct instance type even after capacity error marks offerings unavailable", func() { + ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod() ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) + ExpectScheduled(ctx, env.Client, 
pod) - By("marking whatever zone was picked as unavailable - for both spot and on-demand") - // When ZonalAllocationFailed error is encountered, we block all VM sizes that have >= vCPUs as the VM size for which we encountered the error + var vmSize string + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + aksMachine := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop().AKSMachine + vmSize = lo.FromPtr(aksMachine.Properties.Hardware.VMSize) + } else { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM + vmSize = string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize)) + } + Expect(vmSize).ToNot(BeEmpty()) + + for _, zone := range azureEnv.Zones() { + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU(vmSize), zone, karpv1.CapacityTypeOnDemand) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU(vmSize), zone, karpv1.CapacityTypeSpot) + } + + nodeClaims, err := cloudProvider.List(ctx) + Expect(err).ToNot(HaveOccurred()) + Expect(nodeClaims).To(HaveLen(1)) + if provisionMode.isAKSMachineMode() { + validateAKSMachineNodeClaim(nodeClaims[0], nodePool) + } else { + validateVMNodeClaim(nodeClaims[0], nodePool) + } + Expect(nodeClaims[0].Labels[v1.LabelInstanceTypeStable]).To(Equal(vmSize)) + }) + + It("should handle ZonalAllocationFailed on creating the instance", func() { + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorZoneAllocationFailed("Standard_D2_v2", "1") + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ErrorCode: sdkerrors.ZoneAllocationFailed}, + ) + 
} expectedUnavailableSKUs := []*skewer.SKU{ { Name: lo.ToPtr("Standard_D2_v2"), @@ -640,83 +799,66 @@ func runAKSMachineAPIOfferingTests() { }, }, } + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v2"}}) - // For AKS Machine API, we need to determine the zone from the machine creation attempt - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) - machineInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + if provisionMode.isAKSMachineMode() { + azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Reset() + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() + } + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + retryPod := pod + if provisionMode.isAKSMachineMode() { + ExpectNotScheduled(ctx, env.Client, pod) + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) + } else { + // Problem: ExpectProvisionedAndWaitForPromises can bind this pod before the scriptless async VM poll failure is observed. + // Cleanup is still attempted: VM Delete first does a Get, but this fake poll failure never stores a VM, so BeginDelete cannot be asserted. + // TODO: Make the fake scriptless async VM failure store the VM before poll failure so this can assert unscheduled pod state and VM BeginDelete. 
+ Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(BeNumerically(">=", 2)) + } - // Extract zone from AKS machine - similar to VM test pattern - failedZone, err := instance.GetAKSLabelZoneFromAKSMachine(&machineInput.AKSMachine, fake.Region) + By("marking whatever zone was picked as unavailable - for both spot and on-demand") + var zone string + var err error + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) + machineInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + zone, err = instance.GetAKSLabelZoneFromAKSMachine(&machineInput.AKSMachine, fake.Region) + } else { + zone, err = zones.MakeAKSLabelZoneFromVM(&azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM) + } Expect(err).ToNot(HaveOccurred()) - for _, skuToCheck := range expectedUnavailableSKUs { - Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, failedZone, karpv1.CapacityTypeSpot)).To(BeTrue()) - Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, failedZone, karpv1.CapacityTypeOnDemand)).To(BeTrue()) + Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, zone, karpv1.CapacityTypeSpot)).To(BeTrue()) + Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, zone, karpv1.CapacityTypeOnDemand)).To(BeTrue()) } By("successfully scheduling in a different zone on retry") - // Clear the error and verify retry succeeds in different zone - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil - - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - - // Verify machine was created in a different zone than the failed one - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(failedZone)) - 
Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) - }) - - // Ported from VM test: DescribeTable "Should not return unavailable offerings" - Context("should not return unavailable offerings", func() { - It("should leave regional offerings available when all real zones are unavailable", func() { - for _, zone := range azureEnv.Zones() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } - instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClass) - Expect(err).ToNot(HaveOccurred()) - - seeUnavailable := false - for _, instanceType := range instanceTypes { - if instanceType.Name == "Standard_D2_v2" { - // We want to validate we see the offering in the list, - // but only the regional offerings should remain available. 
- seeUnavailable = true - Expect(lo.Map(instanceType.Offerings.Available(), func(offering *corecloudprovider.Offering, _ int) string { - return offering.Requirements.Get(v1.LabelTopologyZone).Any() - })).To(ConsistOf(zones.Regional, zones.Regional)) - } else { - Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) - } - } - // we should see the unavailable offering in the list - Expect(seeUnavailable).To(BeTrue()) - }) - It("should not return unavailable offerings - non-zonal", func() { - for _, zone := range azureEnvNonZonal.Zones() { - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } - instanceTypes, err := azureEnvNonZonal.InstanceTypesProvider.List(ctx, nodeClass) - Expect(err).ToNot(HaveOccurred()) - - seeUnavailable := false - for _, instanceType := range instanceTypes { - if instanceType.Name == "Standard_D2_v2" { - // We want to validate we see the offering in the list, - // but we also expect it to not have any available offerings - seeUnavailable = true - Expect(len(instanceType.Offerings.Available())).To(Equal(0)) - } else { - Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) - } - } - // we should see the unavailable offering in the list - Expect(seeUnavailable).To(BeTrue()) - }) + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = nil + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.Error.Set(nil) + // This is the same core test-helper limitation as above: scriptless can observe the VM poll error only after the helper has already fake-scheduled the pod. 
+ // That scheduled original pod is unusable for validating scriptless retry, because retrying it could pass by reusing the fake-bound failed attempt instead of provisioning again. + // Machine API keeps the original pod pending in this scenario, so it can retry the same pod; scriptless uses a fresh pod constrained away from the failed zone to force a real retry. + retryZone, ok := lo.Find(azureEnv.Zones(), func(candidate string) bool { return candidate != zone }) + Expect(ok).To(BeTrue()) + retryPod = coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{v1.LabelTopologyZone: retryZone}}) + } + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, retryPod) + node := ExpectScheduled(ctx, env.Client, retryPod) + Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(zone)) + if provisionMode.isAKSMachineMode() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(BeNumerically(">", 0)) + } }) - // Ported from VM test: "should launch instances in a different zone than preferred" It("should launch instances in a different zone than preferred when zone is unavailable", func() { azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) @@ -747,7 +889,6 @@ func runAKSMachineAPIOfferingTests() { Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) }) - // Ported from VM test: "should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error" It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() { azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", 
fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeSpot) @@ -766,7 +907,6 @@ func runAKSMachineAPIOfferingTests() { }, })) } - // Provisions 2 smaller instances since larger was ICE'd ExpectApplied(ctx, env.Client, nodeClass, nodePool) ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) @@ -779,121 +919,168 @@ func runAKSMachineAPIOfferingTests() { Expect(nodeNames.Len()).To(Equal(2)) }) - // Ported from VM test: "should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry" - Context("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry", func() { - It("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry - zonal", func() { - for _, zone := range azureEnv.Zones() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) + DescribeTable("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry", + func(nonZonal bool) { + azEnv := azureEnv + targetCluster := cluster + targetCloudProvider := cloudProvider + targetProvisioner := coreProvisioner + if nonZonal { + azEnv = azureEnvNonZonal + targetCluster = clusterNonZonal + targetCloudProvider = cloudProviderNonZonal + targetProvisioner = coreProvisionerNonZonal } - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeSpot) - 
azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeOnDemand) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, - }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // capacity shortage is over - expire the items from the cache and try again - azureEnv.UnavailableOfferingsCache.Flush() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_D2_v2")) - }) - It("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry - non-zonal", func() { - for _, zone := range azureEnvNonZonal.Zones() { - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnvNonZonal.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) + for _, zone := range azEnv.Zones() { + azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) + azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) + } + if !nonZonal { + azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeSpot) + azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", 
fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeOnDemand) } ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod(coretest.PodOptions{ NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) + ExpectProvisionedAndWaitForPromises(ctx, env.Client, targetCluster, targetCloudProvider, targetProvisioner, azEnv, pod) ExpectNotScheduled(ctx, env.Client, pod) - // capacity shortage is over - expire the items from the cache and try again - azureEnvNonZonal.UnavailableOfferingsCache.Flush() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) + azEnv.UnavailableOfferingsCache.Flush() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, targetCluster, targetCloudProvider, targetProvisioner, azEnv, pod) node := ExpectScheduled(ctx, env.Client, pod) Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_D2_v2")) - }) + }, + Entry("zonal", false), + Entry("non-zonal", true), + ) + + It("should mark SKU as unavailable in all zones for Spot", func() { + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorSkuNotAvailable(defaultTestSKU.GetName(), fake.Region) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ErrorCode: sdkerrors.SKUNotAvailableErrorCode}, + ) + } + + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{defaultTestSKU.GetName()}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: 
[]string{karpv1.CapacityTypeSpot}}, + ) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + for _, zoneID := range []string{"1", "2", "3"} { + ExpectUnavailable(azureEnv, defaultTestSKU, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), karpv1.CapacityTypeSpot) + } }) - // Ported from VM test context: "SkuNotAvailable" - Context("SKUNotAvailable", func() { - AssertUnavailable := func(sku *skewer.SKU, capacityType string) { - // Simulate SKU not available error via AKS Machine API - azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorSkuNotAvailable(sku.GetName(), fake.Region) - - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, + It("should mark SKU as unavailable in all zones for OnDemand", func() { + if provisionMode.isAKSMachineMode() { + azureEnv.AKSMachinesAPI.AfterPollProvisioningErrorOverride = fake.AKSMachineAPIProvisioningErrorSkuNotAvailable(defaultTestSKU.GetName(), fake.Region) + } else { + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( + &azcore.ResponseError{ErrorCode: sdkerrors.SKUNotAvailableErrorCode}, ) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - for _, zoneID := range []string{"1", "2", "3"} { - ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), 
capacityType) - } } - // Ported from VM test: "should mark SKU as unavailable in all zones for Spot" - It("should mark SKU as unavailable in all zones for Spot", func() { - AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeSpot) - }) - - // Ported from VM test: "should mark SKU as unavailable in all zones for OnDemand" - It("should mark SKU as unavailable in all zones for OnDemand", func() { - AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeOnDemand) - }) + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{defaultTestSKU.GetName()}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeOnDemand}}, + ) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + for _, zoneID := range []string{"1", "2", "3"} { + ExpectUnavailable(azureEnv, defaultTestSKU, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), karpv1.CapacityTypeOnDemand) + } }) - // This is from AKS RP frontend errors rather then CRP - Context("SKUNotAvailable - AKS Machine API sync phase", func() { - AssertUnavailableSync := func(syncErr *azcore.ResponseError, sku *skewer.SKU, capacityType string) { - azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(syncErr) - - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, - ) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := 
coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - for _, zoneID := range []string{"1", "2", "3"} { - ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), capacityType) + // This is from AKS RP frontend errors rather than CRP (which is what Scriptless calls). + if provisionMode.isAKSMachineMode() { + Context("SKUNotAvailable - AKS Machine API sync phase", func() { + AssertUnavailableSync := func(syncErr *azcore.ResponseError, sku *skewer.SKU, capacityType string) { + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.BeginError.Set(syncErr) + + coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, + karpv1.NodeSelectorRequirementWithMinValues{ + Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, + ) + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + pod := coretest.UnschedulablePod() + ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) + ExpectNotScheduled(ctx, env.Client, pod) + for _, zoneID := range []string{"1", "2", "3"} { + ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), capacityType) + } } - } - It("should handle VMSizeNotSupported sync error and mark SKU unavailable", func() { - AssertUnavailableSync( - fake.AKSMachineAPIErrorVMSizeNotSupported(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), - defaultTestSKU, karpv1.CapacityTypeOnDemand, - ) - }) + It("should handle VMSizeNotSupported sync error and mark SKU unavailable", func() { + AssertUnavailableSync( + fake.AKSMachineAPIErrorVMSizeNotSupported(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), + 
defaultTestSKU, karpv1.CapacityTypeOnDemand, + ) + }) - It("should handle BadRequest 'not supported for subscription' sync error and mark SKU unavailable", func() { - AssertUnavailableSync( - fake.AKSMachineAPIErrorVMSizeNotSupportedBadRequest(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), - defaultTestSKU, karpv1.CapacityTypeSpot, - ) + It("should handle BadRequest 'not supported for subscription' sync error and mark SKU unavailable", func() { + AssertUnavailableSync( + fake.AKSMachineAPIErrorVMSizeNotSupportedBadRequest(lo.FromPtr(defaultTestSKU.Name), azureEnv.SubscriptionID, fake.Region), + defaultTestSKU, karpv1.CapacityTypeSpot, + ) + }) }) - }) + } }) - } var _ = Describe("CloudProvider", func() { + Context("ProvisionMode = AKSScriptless, ManageExistingAKSMachines = false", func() { + BeforeEach(func() { + testOptions = test.Options(test.OptionsFields{ + ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSScriptless), + ManageExistingAKSMachines: lo.ToPtr(false), + }) + ctx = coreoptions.ToContext(ctx, coretest.Options()) + ctx = options.ToContext(ctx, testOptions) + + azureEnv = test.NewEnvironment(ctx, env) + azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) + statusController = status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) + cloudProviderNonZonal = New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.VMInstanceProvider, azureEnvNonZonal.AKSMachineProvider, 
events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider, azureEnvNonZonal.InstanceTypeStore) + + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) + coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) + coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, recorder, cloudProviderNonZonal, clusterNonZonal, fakeClock) + + ExpectApplied(ctx, env.Client, nodeClass, nodePool) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + }) + + AfterEach(func() { + cloudProvider.WaitForInstancePromises() + cluster.Reset() + clusterNonZonal.Reset() + azureEnv.Reset(ctx) + azureEnvNonZonal.Reset(ctx) + }) + + runOfferingTests(aksscriptlessProvisionMode()) + }) + Context("ProvisionMode = AKSMachineAPIHeaderBatch", func() { BeforeEach(func() { testOptions = test.Options(test.OptionsFields{ @@ -928,6 +1115,6 @@ var _ = Describe("CloudProvider", func() { azureEnvNonZonal.Reset(ctx) }) - runAKSMachineAPIOfferingTests() + runOfferingTests(aksMachineAPIHeaderBatchProvisionMode()) }) }) diff --git a/pkg/cloudprovider/suite_test.go b/pkg/cloudprovider/suite_test.go index ceb47cae7..9986b984d 100644 --- a/pkg/cloudprovider/suite_test.go +++ b/pkg/cloudprovider/suite_test.go @@ -18,25 +18,21 @@ package cloudprovider // TODO v1beta1 extra refactor into suite_test.go / cloudprovider_test.go import ( + "bytes" "context" + "fmt" + "io" "testing" "time" - . "github.com/Azure/karpenter-provider-azure/pkg/test/expectations" "github.com/awslabs/operatorpkg/object" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/samber/lo" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/record" clock "k8s.io/utils/clock/testing" - "sigs.k8s.io/controller-runtime/pkg/client" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" - karpv1alpha1 "sigs.k8s.io/karpenter/pkg/apis/v1alpha1" - corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" - "sigs.k8s.io/karpenter/pkg/controllers/nodeoverlay" "sigs.k8s.io/karpenter/pkg/controllers/provisioning" "sigs.k8s.io/karpenter/pkg/controllers/state" "sigs.k8s.io/karpenter/pkg/events" @@ -48,11 +44,9 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/apis" "github.com/Azure/karpenter-provider-azure/pkg/apis/v1beta1" - "github.com/Azure/karpenter-provider-azure/pkg/consts" "github.com/Azure/karpenter-provider-azure/pkg/controllers/nodeclass/status" "github.com/Azure/karpenter-provider-azure/pkg/fake" "github.com/Azure/karpenter-provider-azure/pkg/operator/options" - "github.com/Azure/karpenter-provider-azure/pkg/providers/instance" "github.com/Azure/karpenter-provider-azure/pkg/test" "github.com/Azure/karpenter-provider-azure/pkg/utils/zones" ) @@ -86,6 +80,10 @@ func TestCloudProvider(t *testing.T) { RunSpecs(t, "cloudProvider/Azure") } +func createSDKErrorBody(code, message string) io.ReadCloser { + return io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"error":{"code": "%s", "message": "%s"}}`, code, message)))) +} + var _ = BeforeSuite(func() { env = coretest.NewEnvironment(coretest.WithCRDs(apis.CRDs...), coretest.WithCRDs(v1alpha1.CRDs...), coretest.WithFieldIndexers(coretest.NodeProviderIDFieldIndexer(ctx))) ctx = coreoptions.ToContext(ctx, coretest.Options()) @@ -180,87 +178,59 @@ func validateVMNodeClaim(nodeClaim *karpv1.NodeClaim, nodePool *karpv1.NodePool) Expect(nodeClaim.Annotations).ToNot(HaveKey(v1beta1.AnnotationAKSMachineResourceID)) } -func reconcileCapacityOverlay(customResource v1.ResourceName, 
overlayCapacity resource.Quantity) { - GinkgoHelper() - nodeOverlay := coretest.NodeOverlay(karpv1alpha1.NodeOverlay{ - Spec: karpv1alpha1.NodeOverlaySpec{ - Requirements: []karpv1alpha1.NodeSelectorRequirement{{ - Key: karpv1.NodePoolLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{nodePool.Name}, - }}, - Capacity: v1.ResourceList{customResource: overlayCapacity}, - }, - }) - ExpectApplied(ctx, env.Client, nodeOverlay) - nodeOverlayController := nodeoverlay.NewController(env.Client, cloudProvider, azureEnv.InstanceTypeStore, cluster) - ExpectReconcileSucceeded(ctx, nodeOverlayController, client.ObjectKeyFromObject(nodeOverlay)) +func validateAKSMachineNodeClaim(nodeClaim *karpv1.NodeClaim, nodePool *karpv1.NodePool) { + // Common validations + validateNodeClaimCommon(nodeClaim, nodePool) + + // AKS-specific annotations + Expect(nodeClaim.Annotations).To(HaveKey(v1beta1.AnnotationAKSMachineResourceID)) + Expect(nodeClaim.Annotations[v1beta1.AnnotationAKSMachineResourceID]).ToNot(BeEmpty()) } -type nodeOverlayCapacityTestOptions struct { - validateNodeClaim func(*karpv1.NodeClaim) - resetCreateCalls func() - expectCreateCalls func() +type provisionModeKind string + +const ( + provisionModeKindScriptless provisionModeKind = "scriptless" + provisionModeKindBootstrappingClient provisionModeKind = "bootstrappingClient" + provisionModeKindAKSMachineAPI provisionModeKind = "aksMachineAPI" + provisionModeKindAKSMachineAPIHeaderBatch provisionModeKind = "aksMachineAPIHeaderBatch" +) + +type provisionModeTestCase struct { + name string + kind provisionModeKind + validateNodeClaim func(*karpv1.NodeClaim) + resetCreateCalls func() + expectCreateCalls func() + expectCreatedResource func() + resetListCalls func() + expectListCalls func() + resetGetCalls func() + expectGetCalls func() + resetDeleteCalls func() + expectDeleteCalls func() } -func runNodeOverlayCapacityTests(testOptions nodeOverlayCapacityTestOptions) { - Context("NodeOverlay", func() { - It("should 
launch a NodeClaim that requests capacity added by a NodeOverlay", func() { - ctx = coreoptions.ToContext(ctx, coretest.Options(coretest.OptionsFields{ - FeatureGates: coretest.FeatureGates{NodeOverlay: lo.ToPtr(true)}, - })) - customResource := v1.ResourceName("example.com/dongle") - overlayCapacity := resource.MustParse("100") - nodeClaim.Spec.Resources.Requests = v1.ResourceList{customResource: resource.MustParse("1")} - - ExpectApplied(ctx, env.Client, nodeClass, nodePool, nodeClaim) - reconcileCapacityOverlay(customResource, overlayCapacity) - - if testOptions.resetCreateCalls != nil { - testOptions.resetCreateCalls() - } - cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(err).ToNot(HaveOccurred()) - Expect(cloudProviderMachine).ToNot(BeNil()) - if testOptions.validateNodeClaim != nil { - testOptions.validateNodeClaim(cloudProviderMachine) - } - if testOptions.expectCreateCalls != nil { - testOptions.expectCreateCalls() - } - capacity, ok := cloudProviderMachine.Status.Capacity[customResource] - Expect(ok).To(BeTrue()) - Expect(capacity.Cmp(overlayCapacity)).To(Equal(0)) - allocatable, ok := cloudProviderMachine.Status.Allocatable[customResource] - Expect(ok).To(BeTrue()) - Expect(allocatable.Cmp(overlayCapacity)).To(Equal(0)) - }) - - It("should not use overlaid capacity when NodeOverlay is disabled", func() { - // Explicitly disable the NodeOverlay feature gate so this test does not - // depend on ordering with the previous It block that enables it. 
- ctx = coreoptions.ToContext(ctx, coretest.Options(coretest.OptionsFields{ - FeatureGates: coretest.FeatureGates{NodeOverlay: lo.ToPtr(false)}, - })) - customResource := v1.ResourceName("example.com/dongle") - overlayCapacity := resource.MustParse("100") - nodeClaim.Spec.Resources.Requests = v1.ResourceList{customResource: resource.MustParse("1")} - - ExpectApplied(ctx, env.Client, nodeClass, nodePool, nodeClaim) - reconcileCapacityOverlay(customResource, overlayCapacity) - - if testOptions.resetCreateCalls != nil { - testOptions.resetCreateCalls() - } - cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) - Expect(cloudProviderMachine).To(BeNil()) - }) - }) +func (p provisionModeTestCase) isAKSMachineAPIHeaderBatchMode() bool { + return p.kind == provisionModeKindAKSMachineAPIHeaderBatch +} + +func (p provisionModeTestCase) isAKSMachineMode() bool { + switch p.kind { + case provisionModeKindAKSMachineAPI, provisionModeKindAKSMachineAPIHeaderBatch: + return true + case provisionModeKindScriptless, provisionModeKindBootstrappingClient: + return false + default: + Fail(fmt.Sprintf("unknown provision mode kind %q for %q", p.kind, p.name)) + return false + } } -func vmNodeOverlayCapacityTestOptions() nodeOverlayCapacityTestOptions { - return nodeOverlayCapacityTestOptions{ +func aksscriptlessProvisionMode() provisionModeTestCase { + return provisionModeTestCase{ + name: "AKSScriptless", + kind: provisionModeKindScriptless, validateNodeClaim: func(nodeClaim *karpv1.NodeClaim) { validateVMNodeClaim(nodeClaim, nodePool) }, @@ -272,217 +242,83 @@ func vmNodeOverlayCapacityTestOptions() nodeOverlayCapacityTestOptions { Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) }, + expectCreatedResource: func() { 
+ createInput := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(createInput.VM.Properties).ToNot(BeNil()) + }, + resetListCalls: func() { + azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Reset() + azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Reset() + }, + expectListCalls: func() { + if testOptions.ManageExistingAKSMachines { + Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) + } else { + Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(0)) + } + Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) + }, + resetGetCalls: func() { + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() + }, + expectGetCalls: func() { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) + }, + resetDeleteCalls: func() { + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Reset() + azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Reset() + }, + expectDeleteCalls: func() { + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(0)) + }, } } -var _ = Describe("CloudProvider", func() { - // Attention: tests under "ProvisionMode = AKSScriptless" are not applicable to ProvisionMode = AKSMachineAPI option. - // Due to different assumptions, not all tests can be shared. Add tests for AKS machine instances in a different Context/file. 
- // If ProvisionMode = AKSScriptless is no longer supported, their code/tests will be replaced with ProvisionMode = AKSMachineAPI. - Context("ProvisionMode = AKSScriptless, ManageExistingAKSMachines = false", func() { - BeforeEach(func() { - testOptions = test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSScriptless), - ManageExistingAKSMachines: lo.ToPtr(false), - }) - ctx = coreoptions.ToContext(ctx, coretest.Options()) - ctx = options.ToContext(ctx, testOptions) - - azureEnv = test.NewEnvironment(ctx, env) - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) - - cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) - coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - }) - - AfterEach(func() { - // Wait for any async polling goroutines to complete before resetting - cloudProvider.WaitForInstancePromises() - cluster.Reset() - azureEnv.Reset(ctx) - }) - - It("should list nodeclaim created by the CloudProvider", func() { - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - - nodeClaims, _ := cloudProvider.List(ctx) - Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(0)) - Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) - queryRequest := azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Pop().Query - 
Expect(*queryRequest.Query).To(Equal(instance.GetVMListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String())) - Expect(nodeClaims).To(HaveLen(1)) - validateVMNodeClaim(nodeClaims[0], nodePool) - resp, _ := azureEnv.VirtualMachinesAPI.Get(ctx, azureEnv.AzureResourceGraphAPI.ResourceGroup, nodeClaims[0].Name, nil) - Expect(resp.VirtualMachine).ToNot(BeNil()) - }) - It("should list nodeclaim with correct instance type even after capacity error marks offerings unavailable", func() { - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - - // Get the instance type from the created VM - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - vmSize := string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize)) - Expect(vmSize).ToNot(BeEmpty()) - - // Simulate a capacity error by marking all offerings for this instance type as unavailable - for _, zone := range azureEnv.Zones() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU(vmSize), zone, karpv1.CapacityTypeOnDemand) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU(vmSize), zone, karpv1.CapacityTypeSpot) - } - - // List should still return the nodeclaim with the correct instance type - nodeClaims, err := cloudProvider.List(ctx) - Expect(err).ToNot(HaveOccurred()) - Expect(nodeClaims).To(HaveLen(1)) - validateVMNodeClaim(nodeClaims[0], nodePool) - Expect(nodeClaims[0].Labels[v1.LabelInstanceTypeStable]).To(Equal(vmSize)) - }) - It("should return an ICE error when there are no instance types to launch", func() { - // Specify no instance types and expect to receive a capacity 
error - nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - { - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"doesnotexist"}, // will not match any instance types - }, - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) - cloudProviderMachine, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) - Expect(cloudProviderMachine).To(BeNil()) - }) - - runNodeOverlayCapacityTests(vmNodeOverlayCapacityTestOptions()) - - Context("AKS Machine API integration", func() { - It("should not call writes to AKS Machine API", func() { - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - }) - - Context("AKS Machines Pool Management", func() { - It("should handle AKS machines pool not found on each CloudProvider operation", func() { - // First create a successful VM - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // cloudprovider.List should return vm nodeclaim - nodeClaims, err := cloudProvider.List(ctx) - Expect(err).ToNot(HaveOccurred()) - Expect(nodeClaims).To(HaveLen(1)) - validateVMNodeClaim(nodeClaims[0], nodePool) - - // cloudprovider.Delete should be fine also - err = cloudProvider.Delete(ctx, nodeClaims[0]) - Expect(err).ToNot(HaveOccurred()) - }) - }) - }) - }) - - Context("ProvisionMode = AKSScriptless, ManageExistingAKSMachines = true", func() { - BeforeEach(func() { - testOptions = 
test.Options(test.OptionsFields{ - ProvisionMode: lo.ToPtr(consts.ProvisionModeAKSScriptless), - ManageExistingAKSMachines: lo.ToPtr(true), - }) - ctx = coreoptions.ToContext(ctx, coretest.Options()) - ctx = options.ToContext(ctx, testOptions) - - azureEnv = test.NewEnvironment(ctx, env) - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, azureEnv.AKSMachineProvider, recorder, env.Client, azureEnv.ImageProvider, azureEnv.InstanceTypeStore) - - cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) - coreProvisioner = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock) - }) - - AfterEach(func() { - // Wait for any async polling goroutines to complete before resetting - cloudProvider.WaitForInstancePromises() - cluster.Reset() - azureEnv.Reset(ctx) - }) - - It("should list nodeclaim created by the CloudProvider", func() { - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - - nodeClaims, _ := cloudProvider.List(ctx) - Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) // Expect to be called in case of existing AKS machines +func aksMachineAPIHeaderBatchProvisionMode() provisionModeTestCase { + return provisionModeTestCase{ + name: "AKSMachineAPIHeaderBatch", + kind: provisionModeKindAKSMachineAPIHeaderBatch, + validateNodeClaim: func(nodeClaim *karpv1.NodeClaim) { + validateAKSMachineNodeClaim(nodeClaim, nodePool) + }, + resetCreateCalls: func() { + azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + 
azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Reset() + }, + expectCreateCalls: func() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) + }, + expectCreatedResource: func() { + createInput := azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(createInput.AKSMachine.Properties).ToNot(BeNil()) + }, + resetListCalls: func() { + azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Reset() + azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Reset() + }, + expectListCalls: func() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineNewListPagerBehavior.CalledWithInput.Len()).To(Equal(1)) Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1)) - queryRequest := azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Pop().Query - Expect(*queryRequest.Query).To(Equal(instance.GetVMListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String())) - Expect(nodeClaims).To(HaveLen(1)) - validateVMNodeClaim(nodeClaims[0], nodePool) - resp, _ := azureEnv.VirtualMachinesAPI.Get(ctx, azureEnv.AzureResourceGraphAPI.ResourceGroup, nodeClaims[0].Name, nil) - Expect(resp.VirtualMachine).ToNot(BeNil()) - }) - It("should return an ICE error when there are no instance types to launch", func() { - // Specify no instance types and expect to receive a capacity error - nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - { - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"doesnotexist"}, // will not match any instance types - }, - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) - cloudProviderMachine, err := CreateAndWaitForPromises(ctx, 
cloudProvider, azureEnv, nodeClaim) - Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) - Expect(cloudProviderMachine).To(BeNil()) - }) - - runNodeOverlayCapacityTests(vmNodeOverlayCapacityTestOptions()) - - Context("AKS Machine API integration", func() { - It("should not call writes to AKS Machine API", func() { - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.AKSMachinesAPI.AKSMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - }) - - Context("AKS Machines Pool Management", func() { - It("should handle AKS machines pool not found on each CloudProvider operation", func() { - // First create a successful VM - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // cloudprovider.List should return vm nodeclaim - nodeClaims, err := cloudProvider.List(ctx) - Expect(err).ToNot(HaveOccurred()) - Expect(nodeClaims).To(HaveLen(1)) - validateVMNodeClaim(nodeClaims[0], nodePool) - - // cloudprovider.Delete should be fine also - err = cloudProvider.Delete(ctx, nodeClaims[0]) - Expect(err).ToNot(HaveOccurred()) - }) - }) - }) - }) -}) + }, + resetGetCalls: func() { + azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Reset() + azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Reset() + }, + expectGetCalls: func() { + Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) + }, + resetDeleteCalls: func() { + 
azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Reset() + azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Reset() + }, + expectDeleteCalls: func() { + Expect(azureEnv.AKSAgentPoolsAPI.AgentPoolDeleteMachinesBehavior.CalledWithInput.Len()).To(Equal(1)) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineDeleteBehavior.CalledWithInput.Len()).To(Equal(0)) + }, + } +} diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 4b5591fb5..57a424422 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -19,25 +19,17 @@ package instancetype_test import ( "bytes" "context" - "encoding/base64" "fmt" "io" - "net/http" - "strconv" "strings" "testing" - "time" "github.com/awslabs/operatorpkg/object" - corestatus "github.com/awslabs/operatorpkg/status" "github.com/blang/semver/v4" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/samber/lo" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/record" clock "k8s.io/utils/clock/testing" . "sigs.k8s.io/karpenter/pkg/utils/testing" @@ -53,31 +45,18 @@ import ( . 
"sigs.k8s.io/karpenter/pkg/test/expectations" "sigs.k8s.io/karpenter/pkg/test/v1alpha1" - sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" - "github.com/Azure/azure-sdk-for-go/profiles/latest/compute/mgmt/compute" - "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/skewer" - "github.com/Azure/karpenter-provider-azure/pkg/providers/imagefamily" - "github.com/Azure/karpenter-provider-azure/pkg/providers/imagefamily/bootstrap" - "github.com/Azure/karpenter-provider-azure/pkg/providers/instance" - "github.com/Azure/karpenter-provider-azure/pkg/providers/labels" - "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/Azure/karpenter-provider-azure/pkg/apis" "github.com/Azure/karpenter-provider-azure/pkg/apis/v1beta1" "github.com/Azure/karpenter-provider-azure/pkg/cloudprovider" "github.com/Azure/karpenter-provider-azure/pkg/consts" - "github.com/Azure/karpenter-provider-azure/pkg/controllers/nodeclass/status" "github.com/Azure/karpenter-provider-azure/pkg/fake" "github.com/Azure/karpenter-provider-azure/pkg/operator/options" "github.com/Azure/karpenter-provider-azure/pkg/providers/instancetype" - "github.com/Azure/karpenter-provider-azure/pkg/providers/loadbalancer" "github.com/Azure/karpenter-provider-azure/pkg/test" - . "github.com/Azure/karpenter-provider-azure/pkg/test/expectations" - "github.com/Azure/karpenter-provider-azure/pkg/utils" - nodeclaimutils "github.com/Azure/karpenter-provider-azure/pkg/utils/nodeclaim" "github.com/Azure/karpenter-provider-azure/pkg/utils/zones" ) @@ -176,662 +155,13 @@ var _ = Describe("InstanceType Provider", func() { ExpectCleanedUp(ctx, env.Client) }) - Context("ProvisionMode = BootstrappingClient", func() { - // Suggestion: ideally, we want to reuse all tests with just ProvisionMode changed to BootstrappingClient. It needs refactor to allow efficient reuse. - // However, not all tests are applicable. 
E.g., custom data tests are not useful as it is faked, unlike Scriptless. - It("should provision the node and CSE", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterBootstrap, cloudProviderBootstrap, coreProvisionerBootstrap, azureEnvBootstrap, pod) - ExpectCSEProvisioned(azureEnvBootstrap) - ExpectScheduled(ctx, env.Client, pod) - }) - It("should not reattempt creation of a vm thats been created before, and also not CSE", func() { - // This test is more like a sanity check of the current intended behavior. The design of the behavior can be changed if intended. - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"karpenter.sh/nodepool": nodePool.Name}, - }, - Spec: karpv1.NodeClaimSpec{NodeClassRef: &karpv1.NodeClassReference{Name: nodeClass.Name}}, - }) - vmName := instance.GenerateResourceName(nodeClaim.Name) - vm := &armcompute.VirtualMachine{ - Name: lo.ToPtr(vmName), - ID: lo.ToPtr(fake.MkVMID(options.FromContext(ctx).NodeResourceGroup, vmName)), - Location: lo.ToPtr(fake.Region), - Zones: []*string{lo.ToPtr("fantasy-zone")}, - Properties: &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now()), - HardwareProfile: &armcompute.HardwareProfile{ - VMSize: lo.ToPtr(armcompute.VirtualMachineSizeTypesBasicA3), - }, - }, - } - azureEnvBootstrap.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - _, err := cloudProviderBootstrap.Create(ctx, nodeClaim) // Async routine can still be ran in the background after this point - Expect(err).ToNot(HaveOccurred()) - - ExpectCSENotProvisioned(azureEnvBootstrap) - }) - }) - - // Attention: tests under "ProvisionMode = AKSScriptless" are not applicable to ProvisionMode = AKSMachineAPI option. - // Due to different assumptions, not all tests can be shared. 
Add tests for AKS machine instances in a different Context/file. - // If ProvisionMode = AKSScriptless is no longer supported, their code/tests will be replaced with ProvisionMode = AKSMachineAPI. - // - // These tests specifically are added to ProvisionMode = AKSMachineAPI in cloudprovider module to reflect its end-to-end nature. - // Suggestion: move these tests there too(?) - Context("ProvisionMode = AKSScriptless", func() { - Context("Subnet", func() { - It("should use the VNET_SUBNET_ID", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(nic).NotTo(BeNil()) - Expect(lo.FromPtr(nic.Interface.Properties.IPConfigurations[0].Properties.Subnet.ID)).To(Equal("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resourceGroup/providers/Microsoft.Network/virtualNetworks/aks-vnet-12345678/subnets/aks-subnet")) - }) - It("should produce all required azure cni labels", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - decodedString := ExpectDecodedCustomData(azureEnv) - Expect(decodedString).To(SatisfyAll( - ContainSubstring("kubernetes.azure.com/ebpf-dataplane=cilium"), - ContainSubstring("kubernetes.azure.com/network-subnet=aks-subnet"), - ContainSubstring("kubernetes.azure.com/nodenetwork-vnetguid=a519e60a-cac0-40b2-b883-084477fe6f5c"), - ContainSubstring("kubernetes.azure.com/podnetwork-type=overlay"), - ContainSubstring("kubernetes.azure.com/azure-cni-overlay=true"), - )) - }) - It("should include stateless CNI label for kubernetes 1.34+ set 
to true", func() { - // Set kubernetes version to 1.34.0 - nodeClass.Status.KubernetesVersion = lo.ToPtr("1.34.0") - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - decodedString := ExpectDecodedCustomData(azureEnv) - Expect(decodedString).To(SatisfyAll( - ContainSubstring("kubernetes.azure.com/network-stateless-cni=true"), - )) - }) - It("should include stateless CNI label for kubernetes < 1.34 set to false", func() { - // Set kubernetes version to 1.33.0 - nodeClass.Status.KubernetesVersion = lo.ToPtr("1.33.0") - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - decodedString := ExpectDecodedCustomData(azureEnv) - Expect(decodedString).To(SatisfyAll( - ContainSubstring("kubernetes.azure.com/network-stateless-cni=false"), - )) - - }) - It("should use the subnet specified in the nodeclass", func() { - nodeClass.Spec.VNETSubnetID = lo.ToPtr("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpenter/subnets/nodeclassSubnet") - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(nic).NotTo(BeNil()) - 
Expect(lo.FromPtr(nic.Interface.Properties.IPConfigurations[0].Properties.Subnet.ID)).To(Equal("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpenter/subnets/nodeclassSubnet")) - }) - }) - Context("VM Creation Failures", func() { - It("should not reattempt creation of a vm thats been created before", func() { - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"karpenter.sh/nodepool": nodePool.Name}, - }, - Spec: karpv1.NodeClaimSpec{NodeClassRef: &karpv1.NodeClassReference{Name: nodeClass.Name}}, - }) - vmName := instance.GenerateResourceName(nodeClaim.Name) - vm := &armcompute.VirtualMachine{ - Name: lo.ToPtr(vmName), - ID: lo.ToPtr(fake.MkVMID(options.FromContext(ctx).NodeResourceGroup, vmName)), - Location: lo.ToPtr(fake.Region), - Zones: []*string{lo.ToPtr("fantasy-zone")}, // Makes sure we do not get a match from the existing set of zones - Properties: &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now()), - HardwareProfile: &armcompute.HardwareProfile{ - VMSize: lo.ToPtr(armcompute.VirtualMachineSizeTypesBasicA3), - }, - }, - } - azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(err).ToNot(HaveOccurred()) // Without the GET in instance.CreateVirtualMachine this will fail - }) - It("should delete the network interface on failure to create the vm", func() { - ErrMsg := "test error" - ErrCode := fmt.Sprint(http.StatusNotFound) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: ErrCode, - RawResponse: &http.Response{ - Body: createSDKErrorBody(ErrCode, ErrMsg), - }, - }, - ) - pod := coretest.UnschedulablePod() - 
ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // We should have created a nic for the vm - Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - // The nic we used in the vm create, should be cleaned up if the vm call fails - nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(nic).NotTo(BeNil()) - _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(lo.FromPtr(nic.Interface.ID)) - Expect(ok).To(Equal(false)) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - pod = coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - }) - It("should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed", func() { - LowPriorityCoresQuotaErrorMessage := "Operation could not be completed as it results in exceeding approved Low Priority Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22LowPriorityCores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:32,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22LowPriorityCores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. 
Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" - // Create nodepool that has both ondemand and spot capacity types enabled - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}}) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Set the LowPriorityCoresQuota error to be returned when creating the vm - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.OperationNotAllowed, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, LowPriorityCoresQuotaErrorMessage), - }, - }, - ) - // Create a pod that should fail to schedule - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Expect that on-demand nodes are selected if spot capacity is unavailable, and the nodepool uses both spot + on-demand - nodes, err := env.KubernetesInterface.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - Expect(err).ToNot(HaveOccurred()) - Expect(len(nodes.Items)).To(Equal(1)) - Expect(nodes.Items[0].Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) - }) - - It("should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed", func() { - OverconstrainedZonalAllocationErrorMessage := "Allocation failed. 
VM(s) with the following constraints cannot be allocated, because the condition is too restrictive. Please remove some constraints and try again." - // Create nodepool that has both ondemand and spot capacity types enabled - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}}) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Set the OverconstrainedZonalAllocation error to be returned when creating the vm - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.OverconstrainedZonalAllocationRequest, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OverconstrainedZonalAllocationRequest, OverconstrainedZonalAllocationErrorMessage), - }, - }, - ) - - // Create a pod that should fail to schedule - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // ensure that initial zone was made unavailable - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - initialVMSize := string(*vm.Properties.HardwareProfile.VMSize) - initialCapacityType := instance.GetCapacityTypeFromVM(&vm) - zone, err := zones.MakeAKSLabelZoneFromVM(&vm) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, fake.MakeSKU(initialVMSize), zone, karpv1.CapacityTypeSpot) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) - 
Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(initialCapacityType)) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(zone)) - Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) - }) - - It("should fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed", func() { - OverconstrainedAllocationErrorMessage := "Allocation failed. VM(s) with the following constraints cannot be allocated, because the condition is too restrictive." - // Create nodepool that has both ondemand and spot capacity types enabled - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand, karpv1.CapacityTypeSpot}, - }, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1beta1.LabelPlacementScope, - Operator: v1.NodeSelectorOpIn, - Values: []string{v1beta1.PlacementScopeZonal}, - }, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Set the OverconstrainedAllocationError error to be returned when creating the vm - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.OverconstrainedAllocationRequest, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OverconstrainedAllocationRequest, OverconstrainedAllocationErrorMessage), - }, - }, - ) - - // Create a pod that should fail to schedule - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - initialVMSize := string(*vm.Properties.HardwareProfile.VMSize) - initialCapacityType := instance.GetCapacityTypeFromVM(&vm) - _, err := 
zones.MakeAKSLabelZoneFromVM(&vm) - Expect(err).ToNot(HaveOccurred()) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(initialVMSize)) - Expect(node.Labels[karpv1.CapacityTypeLabelKey]).ToNot(Equal(initialCapacityType)) - Expect(node.Labels[karpv1.CapacityTypeLabelKey]).To(Equal(karpv1.CapacityTypeOnDemand)) - Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) - }) - - It("should fail to provision when AllocationFailure errors are hit, then switch placement and succeed", func() { - // Create nodepool that has both ondemand and spot capacity types enabled - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v3", "Standard_D64s_v3"}}) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Set the AllocationFailed error to be returned when creating the vm - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.AllocationFailed, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.AllocationFailed, "Allocation failed. 
We do not have sufficient capacity for the requested VM size in this region."), - }, - }, - ) - - // Create a pod that should fail to schedule - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // ensure that initial VM size was made unavailable - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - initialVMSize := *vm.Properties.HardwareProfile.VMSize - zone, err := zones.MakeAKSLabelZoneFromVM(&vm) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, fake.MakeSKU(string(initialVMSize)), zone, karpv1.CapacityTypeSpot) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal(string(initialVMSize))) - Expect(node.Labels[v1.LabelTopologyZone]).To(Equal(zones.Regional)) - }) - - It("should fail to provision when AllocationFailure errors are hit and regional placement is unavailable", func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v3"}}) - sku := fake.MakeSKU("Standard_D2_v3") - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "RegionalUnavailable", sku, zones.Regional, karpv1.CapacityTypeOnDemand) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.AllocationFailed, - RawResponse: &http.Response{ - Body: 
createSDKErrorBody(sdkerrors.AllocationFailed, "Allocation failed. We do not have sufficient capacity for the requested VM size in this region."), - }, - }, - ) - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - zone, err := zones.MakeAKSLabelZoneFromVM(&vm) - Expect(err).ToNot(HaveOccurred()) - ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, sku, zone, karpv1.CapacityTypeOnDemand) - ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeSpot) - ExpectUnavailable(azureEnv, sku, zones.Regional, karpv1.CapacityTypeOnDemand) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(0)) - }) - - It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { - familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. 
Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.OperationNotAllowed, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaExceededErrorMessage), - }, - }, - ) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - // We should have created a nic for the vm - Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - // The nic we used in the vm create, should be cleaned up if the vm call fails - nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(nic).NotTo(BeNil()) - _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(lo.FromPtr(nic.Interface.ID)) - Expect(ok).To(Equal(false)) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - pod = coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, 
env.Client, pod) - }) - It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() { - familyVCPUQuotaIsZeroErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. 
Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.OperationNotAllowed, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaIsZeroErrorMessage), - }, - }, - ) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - // We should have created a nic for the vm - Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - // The nic we used in the vm create, should be cleaned up if the vm call fails - nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(nic).NotTo(BeNil()) - _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(lo.FromPtr(nic.Interface.ID)) - Expect(ok).To(Equal(false)) - - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) - pod = coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - }) - - It("should return ICE if Total Regional Cores Quota errors are hit", func() { - regionalVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. 
Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests" - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.OperationNotAllowed, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalVCPUQuotaExceededErrorMessage), - }, - }, - ) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, - }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, - }, - }) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) - Expect(claim).To(BeNil()) - - }) - }) - - Context("additional-tags", func() { - It("should add additional tags to the node", func() { - ctx = options.ToContext(ctx, test.Options(test.OptionsFields{ - AdditionalTags: map[string]string{ - "karpenter.azure.com/test-tag": "test-value", - }, - })) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, 
cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Tags).To(Equal(map[string]*string{ - "karpenter.azure.com_test-tag": lo.ToPtr("test-value"), - "karpenter.azure.com_cluster": lo.ToPtr("test-cluster"), - "compute.aks.billing": lo.ToPtr("linux"), - "karpenter.sh_nodepool": lo.ToPtr(nodePool.Name), - })) - - nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() - Expect(nic).NotTo(BeNil()) - Expect(nic.Interface.Tags).To(Equal(map[string]*string{ - "karpenter.azure.com_test-tag": lo.ToPtr("test-value"), - "karpenter.azure.com_cluster": lo.ToPtr("test-cluster"), - "compute.aks.billing": lo.ToPtr("linux"), - "karpenter.sh_nodepool": lo.ToPtr(nodePool.Name), - })) - }) - }) - - DescribeTable("Filtering by LocalDNS", - func(localDNSMode v1beta1.LocalDNSMode, k8sVersion string, shouldIncludeD2s, shouldIncludeD4s bool) { - if localDNSMode != "" { - // Create complete LocalDNS configuration with all required fields - // Note: VnetDNS and KubeDNS overrides must contain both "." 
and "cluster.local" zones - nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: localDNSMode, - VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - { - Zone: "cluster.local", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - }, - KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - { - Zone: "cluster.local", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - 
ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - }, - } - } - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - if k8sVersion != "" { - nodeClass.Status.KubernetesVersion = lo.ToPtr(k8sVersion) - } - // Mirror what the resolver would do: set Status.LocalDNSState=Enabled - // when the resolution would land on Enabled. The test instance-type - // provider uses a nil resolver, so Status is the only path to - // surface Enabled. - setEnabledStatus := func() { - nodeClass.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) - } - switch localDNSMode { - case v1beta1.LocalDNSModeRequired: - setEnabledStatus() - case v1beta1.LocalDNSModeDisabled: - // no status needed; Disabled mode -> false - case v1beta1.LocalDNSModePreferred: - threshold := semver.MustParse("1.36.0") - parsed, perr := semver.ParseTolerant(strings.TrimPrefix(k8sVersion, "v")) - if perr == nil && parsed.GTE(threshold) { - setEnabledStatus() - } - } - ExpectApplied(ctx, env.Client, nodeClass) - instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClass) - Expect(err).ToNot(HaveOccurred()) - Expect(instanceTypes).ShouldNot(BeEmpty()) - - getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } - - if shouldIncludeD2s { - Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), - "Standard_D2s_v3 (2 vCPUs) should be included") - } else { - Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), - "Standard_D2s_v3 (2 vCPUs) should be excluded") - } - - if shouldIncludeD4s { - Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D4s_v3"))), - "Standard_D4s_v3 (4 vCPUs) should be included") - } - }, - Entry("when LocalDNS is required - filters to 4+ vCPUs and 244+ MiB", - v1beta1.LocalDNSModeRequired, "", false, true), - Entry("when LocalDNS is preferred with k8s >= 1.36 - filters to 4+ vCPUs and 244+ MiB", - 
v1beta1.LocalDNSModePreferred, "1.36.0", false, true), - Entry("when LocalDNS is preferred with k8s < 1.36 - includes all SKUs", - v1beta1.LocalDNSModePreferred, "1.35.0", true, true), - Entry("when LocalDNS is disabled - includes all SKUs", - v1beta1.LocalDNSModeDisabled, "", true, true), - Entry("when LocalDNS is not set - includes all SKUs", - v1beta1.LocalDNSMode(""), "", true, true), - ) - - Context("Cache invalidation with LocalDNS", func() { - It("should return different instance type lists when LocalDNS mode changes", func() { - // First, get instance types with LocalDNS disabled - nodeClassDisabled := test.AKSNodeClass() - nodeClassDisabled.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeDisabled, - VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - { - Zone: "cluster.local", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - }, - KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: 
v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - { - Zone: "cluster.local", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - }, - } - nodeClassDisabled.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateDisabled) - instanceTypesDisabled, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClassDisabled) - Expect(err).ToNot(HaveOccurred()) - - // Now get instance types with LocalDNS required - nodeClassEnabled := test.AKSNodeClass() - nodeClassEnabled.Spec.LocalDNS = &v1beta1.LocalDNS{ - Mode: v1beta1.LocalDNSModeRequired, + DescribeTable("Filtering by LocalDNS", + func(localDNSMode v1beta1.LocalDNSMode, k8sVersion string, shouldIncludeD2s, shouldIncludeD4s bool) { + if localDNSMode != "" { + // Create complete LocalDNS configuration with all required fields + // Note: VnetDNS and KubeDNS overrides must contain both "." 
and "cluster.local" zones + nodeClass.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: localDNSMode, VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ { Zone: ".", @@ -847,1468 +177,390 @@ var _ = Describe("InstanceType Provider", func() { { Zone: "cluster.local", QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - }, - KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ - { - Zone: ".", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - { - Zone: "cluster.local", - QueryLogging: v1beta1.LocalDNSQueryLoggingError, - Protocol: v1beta1.LocalDNSProtocolPreferUDP, - ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, - ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, - MaxConcurrent: lo.ToPtr(int32(100)), - CacheDuration: karpv1.MustParseNillableDuration("1h"), - ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), - ServeStale: v1beta1.LocalDNSServeStaleVerify, - }, - }, - } - nodeClassEnabled.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) - ExpectApplied(ctx, env.Client, nodeClassEnabled) - instanceTypesEnabled, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClassEnabled) - Expect(err).ToNot(HaveOccurred()) - - // The lists should be different sizes - 
Expect(len(instanceTypesEnabled)).To(BeNumerically("<", len(instanceTypesDisabled)), - "LocalDNS Required should filter out small SKUs") - - getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } - - // Verify that small SKUs (< 4 vCPUs) are present when disabled but absent when enabled - Expect(instanceTypesDisabled).Should(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), - "Standard_D2s_v3 (2 vCPUs) should be included when LocalDNS is disabled") - Expect(instanceTypesEnabled).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), - "Standard_D2s_v3 (2 vCPUs) should be excluded when LocalDNS is required") - - // Verify that large SKUs (>= 4 vCPUs) are present in both - Expect(instanceTypesDisabled).Should(ContainElement(WithTransform(getName, Equal("Standard_D4s_v3"))), - "Standard_D4s_v3 (4 vCPUs) should be included when LocalDNS is disabled") - Expect(instanceTypesEnabled).Should(ContainElement(WithTransform(getName, Equal("Standard_D4s_v3"))), - "Standard_D4s_v3 (4 vCPUs) should be included when LocalDNS is required") - }) - }) - - DescribeTable("Filtering by ArtifactStreaming", - func(artifactStreaming *v1beta1.ArtifactStreaming, shouldIncludeArm64 bool) { - nodeClass.Spec.ArtifactStreaming = artifactStreaming - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - ExpectApplied(ctx, env.Client, nodeClass) - instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClass) - Expect(err).ToNot(HaveOccurred()) - Expect(instanceTypes).ShouldNot(BeEmpty()) - - getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } - - if shouldIncludeArm64 { - Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D16plds_v5"))), - "ARM64 instance type Standard_D16plds_v5 should be included") - } else { - Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_D16plds_v5"))), - "ARM64 
instance type Standard_D16plds_v5 should be excluded") - } - - // AMD64 instance types should always be included regardless of artifact streaming setting - Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), - "AMD64 instance type Standard_D2s_v3 should always be included") - }, - Entry("when artifact streaming is not set (default) - includes ARM64", - nil, true), - Entry("when artifact streaming is explicitly enabled - excludes ARM64", - &v1beta1.ArtifactStreaming{Enabled: lo.ToPtr(true)}, false), - Entry("when artifact streaming is explicitly disabled - includes ARM64", - &v1beta1.ArtifactStreaming{Enabled: lo.ToPtr(false)}, true), - ) - - Context("Ephemeral Disk", func() { - var originalOptions *options.Options - BeforeEach(func() { - originalOptions = options.FromContext(ctx) - ctx = options.ToContext( - ctx, - test.Options(test.OptionsFields{ - UseSIG: lo.ToPtr(true), - })) - - // Repopilate instance types based on above ctx - Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) - }) - - AfterEach(func() { - ctx = options.ToContext(ctx, originalOptions) - // Clean up instance types - Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) - }) - - Context("FindMaxEphemeralSizeGBAndPlacement(sku *skewer.SKU) -> diskSizeGB, *placement", func() { - // B20ms: - // NvmeDiskSizeInMiB == 0 - // CacheDiskBytes == 32212254720 -> 32.21225472 GB .. 
we should select this as the ephemeral disk size - // placement == CacheDisk - // MaxResourceVolumeMB == 163840 MiB -> 171.80 GB, - // Standard_D128ds_v6: - // NvmeDiskSizeInMiB == 7208960 -> 7559.142441 GB // SupportedEphemeralOSDiskPlacements == NvmeDisk - // and this is greater than 0, so we select 7559, placement == NvmeDisk - // Standard_D16plds_v5: - // NvmeDiskSizeInMiB == 0 - // CacheDiskBytes == 429496729600 -> 429.4967296, this is greater than zero, so we select this as the ephemeral disk size - // placement == CacheDisk and size == 429.4967296 GB - // MaxResourceVolumeMB == 614400 MiB - // Standard_D2as_v6: -> EphemeralOSDiskSupported is false, it should return 0 and nil for placement - // Standard_D128ds_v6: - // NvmeDiskSizeInMiB == 7208960 -> 7559.142441 GB // SupportedEphemeralOSDiskPlacements == NvmeDisk - // and this is greater than 0, so we select 7559, placement == NvmeDisk - // Standard_NC24ads_A100_v4: - // {Name: lo.ToPtr("SupportedEphemeralOSDiskPlacements"), Value: lo.ToPtr("ResourceDisk,CacheDisk")}, - // NvmeDiskSizeInMiB == 915527 -> 959.99964 GB but no SupportedEphemeralOSDiskPlacements == NvmeDisk so we move to cache disk - // CacheDiskBytes == 274877906944 -> 274.877906944 GB so we select cache disk + 274 - // MaxResourceVolumeMB == 65536 MiB - // Standard_D64s_v3: - // NvmeDiskSizeInMiB == 0 - // CacheDiskBytes == 1717986918400 -> 1717.9869184 GB, this is greater than zero, so we select this as the ephemeral disk size - // placement == CacheDisk and size == 1717 GB - // Standard_A0 - // NvmeDiskSizeInMiB == 0 - // CacheDiskBytes == 0, this is zero - // MaxResourceVolumeMB == 20480 Mib -> 21.474836 GB. 
Note that this sku doesnt support ephemeral os disk - DescribeTable("should return the max ephemeral disk size in GB for a given instance type", - func(sku *skewer.SKU, expectedSize int64, expectedPlacement *armcompute.DiffDiskPlacement) { - sizeGB, placement := instancetype.FindMaxEphemeralSizeGBAndPlacement(sku) - Expect(sizeGB).To(Equal(expectedSize)) - Expect(placement).To(Equal(expectedPlacement)) - }, Entry("Standard_B20ms", fake.MakeSKU("Standard_B20ms"), int64(32), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), - Entry("Standard_D128ds_v6", fake.MakeSKU("Standard_D128ds_v6"), int64(7559), lo.ToPtr(armcompute.DiffDiskPlacementNvmeDisk)), - Entry("Standard_D16plds_v5", fake.MakeSKU("Standard_D16plds_v5"), int64(429), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), - Entry("Standard_D2as_v6", fake.MakeSKU("Standard_D2as_v6"), int64(0), nil), // does not support ephemeral - Entry("Standard_NC24ads_A100_v4", fake.MakeSKU("Standard_NC24ads_A100_v4"), int64(274), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), - Entry("Standard_D64s_v3", fake.MakeSKU("Standard_D64s_v3"), int64(1717), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), - Entry("Standard_A0", fake.MakeSKU("Standard_A0"), int64(0), nil), // does not support ephemeral - Entry("Standard_D2_v2", fake.MakeSKU("Standard_D2_v2"), int64(0), nil), // does not support ephemeral - // TODO: codegen - // Entry("Standard_D2pls_v5", fake.MakeSKU("Standard_D2pls_v5"), int64(0), nil), // does not support ephemeral - // Entry("Standard_D2lds_v5", fake.MakeSKU("Standard_D2lds_v5"), int64(80), armcompute.DiffDiskPlacementResourceDisk), - Entry("Nil SKU", nil, int64(0), nil), - ) - }) - Context("Placement", func() { - It("should prefer NVMe disk if supported for ephemeral", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: 
[]string{"Standard_D128ds_v6"}, - }) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) - Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Placement)).To(Equal(armcompute.DiffDiskPlacementNvmeDisk)) - }) - It("should not select NVMe ephemeral disk placement if the sku has an nvme disk, supports ephemeral os disk, but doesnt support NVMe placement", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC24ads_A100_v4"}, - }) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) - Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Placement)).ToNot(Equal(armcompute.DiffDiskPlacementNvmeDisk)) - }) - It("should prefer cache disk placement when both cache and temp disk support ephemeral and fit the default 128GB threshold", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: 
[]string{"Standard_D64s_v3"}, - }) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) - Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Placement)).To(Equal(armcompute.DiffDiskPlacementCacheDisk)) - }) - It("should select managed disk if cache disk is too small but temp disk supports ephemeral and fits osDiskSizeGB to have parity with the AKS Nodepool API", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_B20ms"}, - }) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).To(BeNil()) - }) - }) - It("should use ephemeral disk if supported, and has space of at least 128GB by default", func() { - // Create a NodePool that selects a sku that supports ephemeral - // SKU Standard_D64s_v3 has 1600GB of CacheDisk space, so we expect we can create an ephemeral disk with size 128GB - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: 
[]string{"Standard_D64s_v3"}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) - Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(128))) - // should have local disk attached - Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) - Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Option)).To(Equal(armcompute.DiffDiskOptionsLocal)) - }) - It("should fail to provision if ephemeral disk ask for is too large", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, - Operator: v1.NodeSelectorOpGt, - Values: []string{"100000"}, - }) // No InstanceType will match this requirement - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - - }) - It("should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1beta1.LabelSKUStorageEphemeralOSMaxSize, - Operator: v1.NodeSelectorOpGt, - Values: []string{"0"}, - }) - nodeClass.Spec.OSDiskSizeGB = lo.ToPtr[int32](30) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, 
env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) - Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(30))) - Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Option)).To(Equal(armcompute.DiffDiskOptionsLocal)) - }) - It("should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class", func() { - // Create a Nodepool that selects a sku that supports ephemeral - // SKU Standard_D64s_v3 has 1600GB of CacheDisk space, so we expect we can create an ephemeral disk with size 256GB - nodeClass.Spec.OSDiskSizeGB = lo.ToPtr[int32](256) - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D64s_v3"}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) - Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(256))) - Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Option)).To(Equal(armcompute.DiffDiskOptionsLocal)) - }) - It("should not use ephemeral disk if ephemeral is supported, but we don't have enough space", func() { - // Create a Nodepool that selects a sku that supports ephemeral Standard_D2s_v3 - // Standard_D2s_V3 has 53GB Of CacheDisk space, 
- // and has 16GB of Temp Disk Space. - // With our rule of 100GB being the minimum OSDiskSize, this VM should be created without local disk - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2s_v3"}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk.DiskSizeGB).NotTo(BeNil()) - Expect(*vm.Properties.StorageProfile.OSDisk.DiskSizeGB).To(Equal(int32(128))) - Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).To(BeNil()) - }) - - It("should select NvmeDisk for v6 skus with maxNvmeDiskSize > 0", func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D128ds_v6"}}) - nodeClass.Spec.OSDiskSizeGB = lo.ToPtr[int32](100) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - - Expect(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings).NotTo(BeNil()) - Expect(lo.FromPtr(vm.Properties.StorageProfile.OSDisk.DiffDiskSettings.Placement)).To(Equal(armcompute.DiffDiskPlacementNvmeDisk)) - }) - }) - - Context("Custom DNS", func() { 
- It("should support provisioning with custom DNS server from options", func() { - ctx = options.ToContext( - ctx, - test.Options(test.OptionsFields{ - ClusterDNSServiceIP: lo.ToPtr("10.244.0.1"), - }), - ) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - customData := ExpectDecodedCustomData(azureEnv) - - expectedFlags := map[string]string{ - "cluster-dns": "10.244.0.1", - } - - ExpectKubeletFlags(azureEnv, customData, expectedFlags) - }) - }) - - Context("Nodepool with KubeletConfig", func() { - It("should support provisioning with kubeletConfig, computeResources and maxPods not specified", func() { - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - CPUManagerPolicy: lo.ToPtr("static"), - CPUCFSQuota: lo.ToPtr(true), - CPUCFSQuotaPeriod: metav1.Duration{}, - ImageGCHighThresholdPercent: lo.ToPtr(int32(30)), - ImageGCLowThresholdPercent: lo.ToPtr(int32(20)), - TopologyManagerPolicy: lo.ToPtr("best-effort"), - AllowedUnsafeSysctls: []string{"Allowed", "Unsafe", "Sysctls"}, - ContainerLogMaxSize: lo.ToPtr("42Mi"), - ContainerLogMaxFiles: lo.ToPtr[int32](13), - PodPidsLimit: lo.ToPtr[int64](99), - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - customData := ExpectDecodedCustomData(azureEnv) - - expectedFlags := map[string]string{ - "eviction-hard": "memory.available<750Mi", - "image-gc-high-threshold": "30", - "image-gc-low-threshold": "20", - "cpu-cfs-quota": "true", - "max-pods": "250", - "topology-manager-policy": "best-effort", - "container-log-max-size": "42Mi", - "allowed-unsafe-sysctls": "Allowed,Unsafe,Sysctls", - "cpu-manager-policy": "static", - 
"container-log-max-files": "13", - "pod-max-pids": "99", - } - - ExpectKubeletFlags(azureEnv, customData, expectedFlags) - Expect(customData).To(SatisfyAny( // AKS default - ContainSubstring("--system-reserved=cpu=0,memory=0"), - ContainSubstring("--system-reserved=memory=0,cpu=0"), - )) - Expect(customData).To(SatisfyAny( // AKS calculation based on cpu and memory - ContainSubstring("--kube-reserved=cpu=100m,memory=1843Mi"), - ContainSubstring("--kube-reserved=memory=1843Mi,cpu=100m"), - )) - }) - }) - - Context("Nodepool with KubeletConfig on a kubenet Cluster", func() { - var originalOptions *options.Options - - BeforeEach(func() { - originalOptions = options.FromContext(ctx) - ctx = options.ToContext( - ctx, - test.Options(test.OptionsFields{ - NetworkPlugin: lo.ToPtr("kubenet"), - })) - }) - - AfterEach(func() { - ctx = options.ToContext(ctx, originalOptions) - }) - It("should not include cilium or azure cni vnet labels", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - customData := ExpectDecodedCustomData(azureEnv) - // Since the network plugin is not "azure" it should not include the following kubeletLabels - Expect(customData).To(Not(SatisfyAny( - ContainSubstring("kubernetes.azure.com/network-subnet=aks-subnet"), - ContainSubstring("kubernetes.azure.com/nodenetwork-vnetguid=a519e60a-cac0-40b2-b883-084477fe6f5c"), - ContainSubstring("kubernetes.azure.com/podnetwork-type=overlay"), - ))) - }) - It("should support provisioning with kubeletConfig, computeResources and maxPods not specified", func() { - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - CPUManagerPolicy: lo.ToPtr("static"), - CPUCFSQuota: lo.ToPtr(true), - CPUCFSQuotaPeriod: metav1.Duration{}, - ImageGCHighThresholdPercent: lo.ToPtr(int32(30)), - ImageGCLowThresholdPercent: 
lo.ToPtr(int32(20)), - TopologyManagerPolicy: lo.ToPtr("best-effort"), - AllowedUnsafeSysctls: []string{"Allowed", "Unsafe", "Sysctls"}, - ContainerLogMaxSize: lo.ToPtr("42Mi"), - ContainerLogMaxFiles: lo.ToPtr[int32](13), - PodPidsLimit: lo.ToPtr[int64](99), - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - customData := ExpectDecodedCustomData(azureEnv) - expectedFlags := map[string]string{ - "eviction-hard": "memory.available<750Mi", - "max-pods": "110", - "image-gc-low-threshold": "20", - "image-gc-high-threshold": "30", - "cpu-cfs-quota": "true", - "topology-manager-policy": "best-effort", - "container-log-max-size": "42Mi", - "allowed-unsafe-sysctls": "Allowed,Unsafe,Sysctls", - "cpu-manager-policy": "static", - "container-log-max-files": "13", - "pod-max-pids": "99", - } - ExpectKubeletFlags(azureEnv, customData, expectedFlags) - Expect(customData).To(SatisfyAny( // AKS default - ContainSubstring("--system-reserved=cpu=0,memory=0"), - ContainSubstring("--system-reserved=memory=0,cpu=0"), - )) - Expect(customData).To(SatisfyAny( // AKS calculation based on cpu and memory - ContainSubstring("--kube-reserved=cpu=100m,memory=1843Mi"), - ContainSubstring("--kube-reserved=memory=1843Mi,cpu=100m"), - )) - }) - It("should support provisioning with kubeletConfig, computeResources and maxPods specified", func() { - nodeClass.Spec.Kubelet = &v1beta1.KubeletConfiguration{ - CPUManagerPolicy: lo.ToPtr("static"), - CPUCFSQuota: lo.ToPtr(true), - CPUCFSQuotaPeriod: metav1.Duration{}, - ImageGCHighThresholdPercent: lo.ToPtr(int32(30)), - ImageGCLowThresholdPercent: lo.ToPtr(int32(20)), - TopologyManagerPolicy: lo.ToPtr("best-effort"), - AllowedUnsafeSysctls: []string{"Allowed", "Unsafe", "Sysctls"}, - ContainerLogMaxSize: lo.ToPtr("42Mi"), - ContainerLogMaxFiles: 
lo.ToPtr[int32](13), - PodPidsLimit: lo.ToPtr[int64](99), - } - nodeClass.Spec.MaxPods = lo.ToPtr(int32(15)) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - customData := ExpectDecodedCustomData(azureEnv) - expectedFlags := map[string]string{ - "eviction-hard": "memory.available<750Mi", - "max-pods": "15", - "image-gc-low-threshold": "20", - "image-gc-high-threshold": "30", - "cpu-cfs-quota": "true", - "topology-manager-policy": "best-effort", - "container-log-max-size": "42Mi", - "allowed-unsafe-sysctls": "Allowed,Unsafe,Sysctls", - "cpu-manager-policy": "static", - "container-log-max-files": "13", - "pod-max-pids": "99", - } - - ExpectKubeletFlags(azureEnv, customData, expectedFlags) - Expect(customData).To(SatisfyAny( // AKS default - ContainSubstring("--system-reserved=cpu=0,memory=0"), - ContainSubstring("--system-reserved=memory=0,cpu=0"), - )) - Expect(customData).To(SatisfyAny( // AKS calculation based on cpu and memory - ContainSubstring("--kube-reserved=cpu=100m,memory=1843Mi"), - ContainSubstring("--kube-reserved=memory=1843Mi,cpu=100m"), - )) - }) - }) - - Context("ImageReference", func() { - It("should use shared image gallery images when options are set to UseSIG", func() { - options := test.Options(test.OptionsFields{ - UseSIG: lo.ToPtr(true), - }) - ctx = options.ToContext(ctx) - statusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, options.ParsedDiskEncryptionSetID, options.NetworkPolicy, options.NetworkPlugin) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := 
coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Expect virtual machine to have a shared image gallery id set on it - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) - Expect(vm.Properties.StorageProfile.ImageReference.ID).ShouldNot(BeNil()) - Expect(vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID).Should(BeNil()) - - Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring(options.SIGSubscriptionID)) - Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring("AKSUbuntu")) - }) - It("should use Community Images when options are set to UseSIG=false", func() { - options := test.Options(test.OptionsFields{ - UseSIG: lo.ToPtr(false), - }) - ctx = options.ToContext(ctx) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID).Should(Not(BeNil())) - - }) - - }) - - Context("ImageProvider + Image Family", func() { - - kubernetesVersion := lo.Must(env.KubernetesInterface.Discovery().ServerVersion()).String() - expectUseAzureLinux3 := imagefamily.UseAzureLinux3(kubernetesVersion) - azureLinuxGen2ImageDefinition := lo.Ternary(expectUseAzureLinux3, 
imagefamily.AzureLinux3Gen2ImageDefinition, imagefamily.AzureLinuxGen2ImageDefinition) - azureLinuxGen1ImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen1ImageDefinition, imagefamily.AzureLinuxGen1ImageDefinition) - azureLinuxGen2ArmImageDefinition := lo.Ternary(expectUseAzureLinux3, imagefamily.AzureLinux3Gen2ArmImageDefinition, imagefamily.AzureLinuxGen2ArmImageDefinition) - - DescribeTable("should select the right Shared Image Gallery image for a given instance type", func(instanceType string, imageFamily string, expectedImageDefinition string, expectedGalleryRG string, expectedGalleryURL string) { - options := test.Options(test.OptionsFields{ - UseSIG: lo.ToPtr(true), - }) - ctx = options.ToContext(ctx) - statusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, options.ParsedDiskEncryptionSetID, options.NetworkPolicy, options.NetworkPlugin) - - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) - - expectedPrefix := 
fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/galleries/%s/images/%s", options.SIGSubscriptionID, expectedGalleryRG, expectedGalleryURL, expectedImageDefinition) - Expect(*vm.Properties.StorageProfile.ImageReference.ID).To(ContainSubstring(expectedPrefix)) - - }, - - Entry("Gen2, Gen1 instance type with AKSUbuntu image family", "Standard_D2_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ImageDefinition, imagefamily.AKSUbuntuResourceGroup, imagefamily.AKSUbuntuGalleryName), - Entry("Gen1 instance type with AKSUbuntu image family", "Standard_D2_v3", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen1ImageDefinition, imagefamily.AKSUbuntuResourceGroup, imagefamily.AKSUbuntuGalleryName), - Entry("ARM instance type with AKSUbuntu image family", "Standard_D16plds_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ArmImageDefinition, imagefamily.AKSUbuntuResourceGroup, imagefamily.AKSUbuntuGalleryName), - Entry("Gen2 instance type with AzureLinux image family", "Standard_D2_v5", v1beta1.AzureLinuxImageFamily, azureLinuxGen2ImageDefinition, imagefamily.AKSAzureLinuxResourceGroup, imagefamily.AKSAzureLinuxGalleryName), - Entry("Gen1 instance type with AzureLinux image family", "Standard_D2_v3", v1beta1.AzureLinuxImageFamily, azureLinuxGen1ImageDefinition, imagefamily.AKSAzureLinuxResourceGroup, imagefamily.AKSAzureLinuxGalleryName), - Entry("ARM instance type with AzureLinux image family", "Standard_D16plds_v5", v1beta1.AzureLinuxImageFamily, azureLinuxGen2ArmImageDefinition, imagefamily.AKSAzureLinuxResourceGroup, imagefamily.AKSAzureLinuxGalleryName), - ) - DescribeTable("should select the right image for a given instance type", - func(instanceType string, imageFamily string, expectedImageDefinition string, expectedGalleryURL string) { - statusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, 
azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) - nodeClass.Spec.ImageFamily = lo.ToPtr(imageFamily) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{instanceType}}) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) - Expect(vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID).ToNot(BeNil()) - parts := strings.Split(*vm.Properties.StorageProfile.ImageReference.CommunityGalleryImageID, "/") - Expect(parts[2]).To(Equal(expectedGalleryURL)) - Expect(parts[4]).To(Equal(expectedImageDefinition)) - - // Need to reset env since we are doing these nested tests - cluster.Reset() - azureEnv.Reset(ctx) - }, - Entry("Gen2, Gen1 instance type with AKSUbuntu image family", - "Standard_D2_v5", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen2ImageDefinition, imagefamily.AKSUbuntuPublicGalleryURL), - Entry("Gen1 instance type with AKSUbuntu image family", - "Standard_D2_v3", v1beta1.Ubuntu2204ImageFamily, imagefamily.Ubuntu2204Gen1ImageDefinition, imagefamily.AKSUbuntuPublicGalleryURL), - Entry("ARM instance type with AKSUbuntu image family", - "Standard_D16plds_v5", v1beta1.Ubuntu2204ImageFamily, 
imagefamily.Ubuntu2204Gen2ArmImageDefinition, imagefamily.AKSUbuntuPublicGalleryURL), - Entry("Gen2 instance type with AzureLinux image family", - "Standard_D2_v5", v1beta1.AzureLinuxImageFamily, azureLinuxGen2ImageDefinition, imagefamily.AKSAzureLinuxPublicGalleryURL), - Entry("Gen1 instance type with AzureLinux image family", - "Standard_D2_v3", v1beta1.AzureLinuxImageFamily, azureLinuxGen1ImageDefinition, imagefamily.AKSAzureLinuxPublicGalleryURL), - Entry("ARM instance type with AzureLinux image family", - "Standard_D16plds_v5", v1beta1.AzureLinuxImageFamily, azureLinuxGen2ArmImageDefinition, imagefamily.AKSAzureLinuxPublicGalleryURL), - ) - }) - - Context("Instance Types", func() { - It("should support provisioning with no labels", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - }) - It("should have VM identity set", func() { - ctx = options.ToContext( - ctx, - test.Options(test.OptionsFields{ - NodeIdentities: []string{ - "/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid1", - "/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid2", - }, - })) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Identity).ToNot(BeNil()) - - Expect(lo.FromPtr(vm.Identity.Type)).To(Equal(armcompute.ResourceIdentityTypeUserAssigned)) - 
Expect(vm.Identity.UserAssignedIdentities).ToNot(BeNil()) - Expect(vm.Identity.UserAssignedIdentities).To(HaveLen(2)) - Expect(vm.Identity.UserAssignedIdentities).To(HaveKey("/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid1")) - Expect(vm.Identity.UserAssignedIdentities).To(HaveKey("/subscriptions/1234/resourceGroups/mcrg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myid2")) - }) - Context("VM Profile", func() { - It("should have OS disk and network interface set to auto-delete", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Properties).ToNot(BeNil()) - - Expect(vm.Properties.StorageProfile).ToNot(BeNil()) - Expect(vm.Properties.StorageProfile.OSDisk).ToNot(BeNil()) - osDiskDeleteOption := vm.Properties.StorageProfile.OSDisk.DeleteOption - Expect(osDiskDeleteOption).ToNot(BeNil()) - Expect(lo.FromPtr(osDiskDeleteOption)).To(Equal(armcompute.DiskDeleteOptionTypesDelete)) - - Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) - - for _, nic := range vm.Properties.NetworkProfile.NetworkInterfaces { - nicDeleteOption := nic.Properties.DeleteOption - Expect(nicDeleteOption).To(Not(BeNil())) - Expect(lo.FromPtr(nicDeleteOption)).To(Equal(armcompute.DeleteOptionsDelete)) - } - }) - It("should not create unneeded secondary ips for azure cni with overlay", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - 
ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Properties).ToNot(BeNil()) - - Expect(vm.Properties.StorageProfile.ImageReference).ToNot(BeNil()) - Expect(len(vm.Properties.NetworkProfile.NetworkInterfaces)).To(Equal(1)) - Expect(lo.FromPtr(vm.Properties.NetworkProfile.NetworkInterfaces[0].Properties.Primary)).To(BeTrue()) - - Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop().Interface - Expect(nic.Properties).ToNot(BeNil()) - - Expect(len(nic.Properties.IPConfigurations)).To(Equal(1)) - }) - }) - }) - - Context("GPU Workloads + Nodes", func() { - It("should schedule non-GPU pod onto the cheapest non-GPU capable node", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Properties).ToNot(BeNil()) - Expect(vm.Properties.HardwareProfile).ToNot(BeNil()) - Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse()) - - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0")) - }) - - It("should schedule GPU pod on GPU capable node", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - Name: 
"samples-tf-mnist-demo", - Labels: map[string]string{ - "app": "samples-tf-mnist-demo", - }, - }, - Image: "mcr.microsoft.com/azuredocs/samples-tf-mnist-demo:gpu", - ResourceRequirements: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, - }, - RestartPolicy: v1.RestartPolicy("OnFailure"), - Tolerations: []v1.Toleration{ - { - Key: "sku", - Operator: v1.TolerationOpEqual, - Value: "gpu", - Effect: v1.TaintEffectNoSchedule, - }, - }, - }) - - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - - // the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_NC16as_T4_v3")) - - // Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3) - customData := ExpectDecodedCustomData(azureEnv) - Expect(customData).To(SatisfyAll( - ContainSubstring("GPU_NODE=true"), - ContainSubstring("SGX_NODE=false"), - ContainSubstring("MIG_NODE=false"), - ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"), - ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"), - ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""), - ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)), - ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)), - ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""), - ContainSubstring("GPU_INSTANCE_PROFILE=\"\""), - )) - - // Verify that the node the pod was scheduled on has GPU resource and labels set - Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1"))) - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4")) - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", 
v1beta1.ManufacturerNvidia)) - Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1")) - }) - }) - - Context("Bootstrap", func() { - var ( - kubeletFlags string - customData string - minorVersion uint64 - credentialProviderURL string - ) - BeforeEach(func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - customData = ExpectDecodedCustomData(azureEnv) - kubeletFlags = ExpectKubeletFlagsPassed(customData) - - k8sVersion, err := azureEnv.KubernetesVersionProvider.KubeServerVersion(ctx) - Expect(err).To(BeNil()) - minorVersion = semver.MustParse(k8sVersion).Minor - credentialProviderURL = bootstrap.CredentialProviderURL(k8sVersion, "amd64") - }) - - It("should include or exclude --keep-terminated-pod-volumes based on kubelet version", func() { - if minorVersion < 31 { - Expect(kubeletFlags).To(ContainSubstring("--keep-terminated-pod-volumes")) - } else { - Expect(kubeletFlags).ToNot(ContainSubstring("--keep-terminated-pod-volumes")) - } - }) - - It("should include correct flags and credential provider URL when CredentialProviderURL is not empty", func() { - if credentialProviderURL != "" { - Expect(kubeletFlags).ToNot(ContainSubstring("--azure-container-registry-config")) - Expect(kubeletFlags).To(ContainSubstring("--image-credential-provider-config=/var/lib/kubelet/credential-provider-config.yaml")) - Expect(kubeletFlags).To(ContainSubstring("--image-credential-provider-bin-dir=/var/lib/kubelet/credential-provider")) - Expect(customData).To(ContainSubstring(credentialProviderURL)) - } - }) - - It("should include correct flags when CredentialProviderURL is empty", func() { - if credentialProviderURL == "" { - Expect(kubeletFlags).To(ContainSubstring("--azure-container-registry-config")) - 
Expect(kubeletFlags).ToNot(ContainSubstring("--image-credential-provider-config")) - Expect(kubeletFlags).ToNot(ContainSubstring("--image-credential-provider-bin-dir")) - } - }) - - It("should include karpenter.sh/unregistered taint", func() { - Expect(kubeletFlags).To(ContainSubstring("--register-with-taints=" + karpv1.UnregisteredNoExecuteTaint.ToString())) - }) - }) - - DescribeTable("Azure CNI node labels and agentbaker network plugin", func( - networkPlugin, networkPluginMode, networkDataplane, expectedAgentBakerNetPlugin string, - expectedNodeLabels sets.Set[string]) { - options := test.Options(test.OptionsFields{ - NetworkPlugin: lo.ToPtr(networkPlugin), - NetworkPluginMode: lo.ToPtr(networkPluginMode), - NetworkDataplane: lo.ToPtr(networkDataplane), - }) - ctx = options.ToContext(ctx) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - customData := ExpectDecodedCustomData(azureEnv) - - Expect(customData).To(ContainSubstring(fmt.Sprintf("NETWORK_PLUGIN=%s", expectedAgentBakerNetPlugin))) - - for label := range expectedNodeLabels { - Expect(customData).To(ContainSubstring(label)) - } - }, - Entry("Azure CNI V1", - "azure", "", "", - "azure", sets.New[string]()), - Entry("Azure CNI w Overlay", - "azure", "overlay", "", - "none", - sets.New( - "kubernetes.azure.com/azure-cni-overlay=true", - "kubernetes.azure.com/network-subnet=aks-subnet", - "kubernetes.azure.com/nodenetwork-vnetguid=a519e60a-cac0-40b2-b883-084477fe6f5c", - "kubernetes.azure.com/podnetwork-type=overlay", - )), - Entry("Network Plugin none", - "none", "", "", "none", - sets.New[string]()), - Entry("Azure CNI w Overlay w Cilium", - "azure", "overlay", "cilium", - "none", - sets.New( - "kubernetes.azure.com/azure-cni-overlay=true", - "kubernetes.azure.com/network-subnet=aks-subnet", - 
"kubernetes.azure.com/nodenetwork-vnetguid=a519e60a-cac0-40b2-b883-084477fe6f5c", - "kubernetes.azure.com/podnetwork-type=overlay", - "kubernetes.azure.com/ebpf-dataplane=cilium", - )), - Entry("Cilium w feature flag Microsoft.ContainerService/EnableCiliumNodeSubnet", - "azure", "", "cilium", - "none", - sets.New("kubernetes.azure.com/ebpf-dataplane=cilium")), - ) - - Context("LoadBalancer", func() { - resourceGroup := "test-resourceGroup" - - It("should include loadbalancer backend pools the allocated VMs", func() { - standardLB := test.MakeStandardLoadBalancer(resourceGroup, loadbalancer.SLBName, true) - internalLB := test.MakeStandardLoadBalancer(resourceGroup, loadbalancer.InternalSLBName, false) - - azureEnv.LoadBalancersAPI.LoadBalancers.Store(lo.FromPtr(standardLB.ID), standardLB) - azureEnv.LoadBalancersAPI.LoadBalancers.Store(lo.FromPtr(internalLB.ID), internalLB) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - iface := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop().Interface - - Expect(iface.Properties.IPConfigurations).ToNot(BeEmpty()) - Expect(lo.FromPtr(iface.Properties.IPConfigurations[0].Properties.Primary)).To(Equal(true)) - - backendPools := iface.Properties.IPConfigurations[0].Properties.LoadBalancerBackendAddressPools - Expect(backendPools).To(HaveLen(3)) - Expect(lo.FromPtr(backendPools[0].ID)).To(Equal("/subscriptions/subscriptionID/resourceGroups/test-resourceGroup/providers/Microsoft.Network/loadBalancers/kubernetes/backendAddressPools/kubernetes")) - 
Expect(lo.FromPtr(backendPools[1].ID)).To(Equal("/subscriptions/subscriptionID/resourceGroups/test-resourceGroup/providers/Microsoft.Network/loadBalancers/kubernetes/backendAddressPools/aksOutboundBackendPool")) - Expect(lo.FromPtr(backendPools[2].ID)).To(Equal("/subscriptions/subscriptionID/resourceGroups/test-resourceGroup/providers/Microsoft.Network/loadBalancers/kubernetes-internal/backendAddressPools/kubernetes")) - }) - }) - - Context("Zone-aware provisioning", func() { - It("should prefer zonal placement for zone-capable instance types by default", func() { - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC24ads_A100_v4"}}, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand}}, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeZonal)) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(zones.Regional)) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Zones).ToNot(BeEmpty()) - }) - - It("should launch zone-capable instance types regionally when placement scope requires it", func() { - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC24ads_A100_v4"}}, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeOnDemand}}, - 
karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1beta1.LabelPlacementScope, - Operator: v1.NodeSelectorOpIn, - Values: []string{v1beta1.PlacementScopeRegional}}, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zones.Regional)) - Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelPlacementScope, v1beta1.PlacementScopeRegional)) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Zones).To(BeEmpty()) - }) - - It("should launch in the NodePool-requested zone", func() { - zone, vmZone := fmt.Sprintf("%s-3", fake.Region), "3" - nodePool.Spec.Template.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - {Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeSpot, karpv1.CapacityTypeOnDemand}}, - {Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{zone}}, - } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zone)) - - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm).NotTo(BeNil()) - Expect(vm.Zones).To(ConsistOf(&vmZone)) - }) - It("should support provisioning in non-zonal regions", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal, azureEnvNonZonal, pod) - 
ExpectScheduled(ctx, env.Client, pod) - - Expect(azureEnvNonZonal.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnvNonZonal.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Zones).To(BeEmpty()) - }) - It("should provision non-zonal instance types in zonal regions with zone label 0", func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC6s_v3"}}) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, zones.Regional)) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM - Expect(vm.Zones).To(BeEmpty()) - }) - It("should not include empty zone domain in instance type offerings", func() { - // Verify that no instance type has an offering with zone="" - // which would introduce a phantom domain in topology spread constraint calculations. 
- ExpectApplied(ctx, env.Client, nodePool, nodeClass) - instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool) - Expect(err).ToNot(HaveOccurred()) - Expect(instanceTypes).ToNot(BeEmpty()) - - for _, it := range instanceTypes { - for _, offering := range it.Offerings { - zone := offering.Requirements.Get(v1.LabelTopologyZone).Any() - Expect(zone).ToNot(BeEmpty(), - fmt.Sprintf("instance type %s has an offering with empty zone, which breaks topology spread constraints", it.Name)) - } - } - }) - It("should exclude non-zonal instance types via zone NodePool requirements", func() { - // Users can filter out non-zonal SKUs by constraining zones to specific AZs. - // Non-zonal SKUs have zone "0", so requiring a specific zone prevents them - // from being scheduled. - coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC6s_v3"}}, // non-zonal SKU - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelTopologyZone, - Operator: v1.NodeSelectorOpIn, - Values: []string{fakeZone1}, - }, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - // Non-zonal SKU (zone="0") is incompatible with the zone requirement (zone-1), - // so the pod should not be scheduled. - ExpectNotScheduled(ctx, env.Client, pod) - }) - It("should exclude non-zonal instance types when all real zones are specified", func() { - // Specifying all availability zones still excludes non-zonal SKUs, - // since their zone "0" is not in the allowed list. 
- coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_NC6s_v3"}}, // non-zonal SKU - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelTopologyZone, - Operator: v1.NodeSelectorOpIn, - Values: azureEnv.Zones(), - }, - ) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - }) - It("should schedule pods with zonal topology spread when non-zonal SKUs exist", func() { - // Reproduces https://github.com/Azure/karpenter-provider-azure/issues/1384 - // Previously, non-zonal SKUs had zone="" which collided with Karpenter core's - // sentinel value for "no domain found", making topology spread always unsatisfiable. - // With zone="0", the non-zonal domain is valid and doesn't poison the spread calculation. - podLabels := map[string]string{"app": "tsc-repro"} - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pods := []*v1.Pod{} - for i := 0; i < 3; i++ { - pods = append(pods, coretest.UnschedulablePod(coretest.PodOptions{ - ObjectMeta: metav1.ObjectMeta{Labels: podLabels}, - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: v1.LabelTopologyZone, - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{MatchLabels: podLabels}, - }, - }, - })) - } - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) 
- for _, pod := range pods { - ExpectScheduled(ctx, env.Client, pod) - } - }) - }) - - Context("CloudProvider Create Error Cases", func() { - It("should return error when NodeClass readiness is Unknown", func() { - nodeClass.StatusConditions().SetUnknown(corestatus.ConditionReady) - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, + KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, + }, + { + Zone: "cluster.local", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, }, - }) - - 
ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("resolving NodeClass readiness, NodeClass is in Ready=Unknown")) - }) - - It("should return error when instance type resolution fails", func() { - // Create and set up the status controller - statusController := status.NewController(env.Client, azureEnv.KubernetesVersionProvider, azureEnv.ImageProvider, env.KubernetesInterface, env.KubernetesInterface, azureEnv.DynamicInterface, azureEnv.SubnetsAPI, azureEnv.DiskEncryptionSetsAPI, testOptions.ParsedDiskEncryptionSetID, options.FromContext(ctx).NetworkPolicy, options.FromContext(ctx).NetworkPlugin) - - // Set NodeClass to Ready - nodeClass.StatusConditions().SetTrue(karpv1.ConditionTypeLaunched) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + } + } + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + if k8sVersion != "" { + nodeClass.Status.KubernetesVersion = lo.ToPtr(k8sVersion) + } + // Mirror what the resolver would do: set Status.LocalDNSState=Enabled + // when the resolution would land on Enabled. The test instance-type + // provider uses a nil resolver, so Status is the only path to + // surface Enabled. 
+ setEnabledStatus := func() { + nodeClass.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) + } + switch localDNSMode { + case v1beta1.LocalDNSModeRequired: + setEnabledStatus() + case v1beta1.LocalDNSModeDisabled: + // no status needed; Disabled mode -> false + case v1beta1.LocalDNSModePreferred: + threshold := semver.MustParse("1.36.0") + parsed, perr := semver.ParseTolerant(strings.TrimPrefix(k8sVersion, "v")) + if perr == nil && parsed.GTE(threshold) { + setEnabledStatus() + } + } + ExpectApplied(ctx, env.Client, nodeClass) + instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClass) + Expect(err).ToNot(HaveOccurred()) + Expect(instanceTypes).ShouldNot(BeEmpty()) - // Reconcile the NodeClass to ensure status is updated - ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } - // Flush the cache to simulate the controller not having run yet. - // With the instance type controller, SKU API errors happen during - // UpdateInstanceTypes (controller reconcile), not during List. - // When the cache is empty, List returns an error. 
- azureEnv.InstanceTypesProvider.Reset() + if shouldIncludeD2s { + Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), + "Standard_D2s_v3 (2 vCPUs) should be included") + } else { + Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), + "Standard_D2s_v3 (2 vCPUs) should be excluded") + } - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, + if shouldIncludeD4s { + Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D4s_v3"))), + "Standard_D4s_v3 (4 vCPUs) should be included") + } + }, + Entry("when LocalDNS is required - filters to 4+ vCPUs and 244+ MiB", + v1beta1.LocalDNSModeRequired, "", false, true), + Entry("when LocalDNS is preferred with k8s >= 1.36 - filters to 4+ vCPUs and 244+ MiB", + v1beta1.LocalDNSModePreferred, "1.36.0", false, true), + Entry("when LocalDNS is preferred with k8s < 1.36 - includes all SKUs", + v1beta1.LocalDNSModePreferred, "1.35.0", true, true), + Entry("when LocalDNS is disabled - includes all SKUs", + v1beta1.LocalDNSModeDisabled, "", true, true), + Entry("when LocalDNS is not set - includes all SKUs", + v1beta1.LocalDNSMode(""), "", true, true), + ) + + Context("Cache invalidation with LocalDNS", func() { + It("should return different instance type lists when LocalDNS mode changes", func() { + // First, get instance types with LocalDNS disabled + nodeClassDisabled := test.AKSNodeClass() + nodeClassDisabled.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeDisabled, + VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + 
CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, + { + Zone: "cluster.local", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, - }) - - claim, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("resolving instance types")) - - // Reset instance types - Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) - }) - - It("should return error when instance creation fails", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - // Create a NodeClaim with valid requirements - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - karpv1.NodePoolLabelKey: nodePool.Name, - }, + }, + KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: 
karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, - Spec: karpv1.NodeClaimSpec{ - NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, - Group: object.GVK(nodeClass).Group, - Kind: object.GVK(nodeClass).Kind, - }, + { + Zone: "cluster.local", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, - }) + }, + } + nodeClassDisabled.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateDisabled) + instanceTypesDisabled, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClassDisabled) + Expect(err).ToNot(HaveOccurred()) - // Set up the instance provider to fail - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ - ErrorCode: sdkerrors.OperationNotAllowed, - RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, "Failed to create VM"), - }, + // Now get instance types with LocalDNS required + nodeClassEnabled := test.AKSNodeClass() + nodeClassEnabled.Spec.LocalDNS = &v1beta1.LocalDNS{ + Mode: v1beta1.LocalDNSModeRequired, + VnetDNSOverrides: []v1beta1.LocalDNSZoneOverride{ + { + Zone: ".", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationVnetDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, - ) - - claim, err := 
CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) - Expect(err).To(HaveOccurred()) - Expect(err).To(BeAssignableToTypeOf(&corecloudprovider.CreateError{})) - Expect(claim).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("creating instance failed")) - }) - }) - - // Note: this is the most sharable to AKS machine tests. No ties to VM at all. - // Suggestion: share it? Although might need to rework test location/structure for that. - Context("Unavailable Offerings", func() { - It("should not allocate a vm in a zone marked as unavailable", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v2"}}) - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(fakeZone1)) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) - }) - It("should handle ZonalAllocationFailed on creating the VM", func() { - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.Error.Set( - &azcore.ResponseError{ErrorCode: sdkerrors.ZoneAllocationFailed}, - ) - // when ZonalAllocationFailed error is encountered, we block all VM sizes that have >= vCPUs as the VM size for which we encountered the error - expectedUnavailableSKUs := []*skewer.SKU{ { - Name: lo.ToPtr("Standard_D2_v2"), - Size: lo.ToPtr("D2_v2"), - Family: lo.ToPtr("StandardDv2Family"), - 
Capabilities: &[]compute.ResourceSkuCapabilities{ - { - Name: lo.ToPtr("vCPUs"), - Value: lo.ToPtr("2"), - }, - }, + Zone: "cluster.local", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, + }, + KubeDNSOverrides: []v1beta1.LocalDNSZoneOverride{ { - Name: lo.ToPtr("Standard_D16_v2"), - Size: lo.ToPtr("D16_v2"), - Family: lo.ToPtr("StandardDv2Family"), - Capabilities: &[]compute.ResourceSkuCapabilities{ - { - Name: lo.ToPtr("vCPUs"), - Value: lo.ToPtr("16"), - }, - }, + Zone: ".", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, }, { - Name: lo.ToPtr("Standard_D32_v2"), - Size: lo.ToPtr("D32_v2"), - Family: lo.ToPtr("StandardDv2Family"), - Capabilities: &[]compute.ResourceSkuCapabilities{ - { - Name: lo.ToPtr("vCPUs"), - Value: lo.ToPtr("32"), - }, - }, + Zone: "cluster.local", + QueryLogging: v1beta1.LocalDNSQueryLoggingError, + Protocol: v1beta1.LocalDNSProtocolPreferUDP, + ForwardDestination: v1beta1.LocalDNSForwardDestinationClusterCoreDNS, + ForwardPolicy: v1beta1.LocalDNSForwardPolicySequential, + MaxConcurrent: lo.ToPtr(int32(100)), + CacheDuration: karpv1.MustParseNillableDuration("1h"), + ServeStaleDuration: karpv1.MustParseNillableDuration("30m"), + ServeStale: v1beta1.LocalDNSServeStaleVerify, 
}, - } - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_D2_v2"}}) + }, + } + nodeClassEnabled.Status.LocalDNSState = lo.ToPtr(v1beta1.LocalDNSStateEnabled) + ExpectApplied(ctx, env.Client, nodeClassEnabled) + instanceTypesEnabled, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClassEnabled) + Expect(err).ToNot(HaveOccurred()) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod() - ExpectLaunched(ctx, env.Client, cloudProvider, coreProvisioner, pod) - ExpectNotScheduled(ctx, env.Client, pod) + // The lists should be different sizes + Expect(len(instanceTypesEnabled)).To(BeNumerically("<", len(instanceTypesDisabled)), + "LocalDNS Required should filter out small SKUs") - Eventually(func() []*karpv1.NodeClaim { return ExpectNodeClaims(ctx, env.Client) }).To(HaveLen(0)) + getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } - By("marking whatever zone was picked as unavailable - for both spot and on-demand") - zone, err := zones.MakeAKSLabelZoneFromVM(&azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM) - Expect(err).ToNot(HaveOccurred()) - for _, skuToCheck := range expectedUnavailableSKUs { - Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, zone, karpv1.CapacityTypeSpot)).To(BeTrue()) - Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable(skuToCheck, zone, karpv1.CapacityTypeOnDemand)).To(BeTrue()) - } + // Verify that small SKUs (< 4 vCPUs) are present when disabled but absent when enabled + Expect(instanceTypesDisabled).Should(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), + "Standard_D2s_v3 (2 vCPUs) should be included when LocalDNS is disabled") + Expect(instanceTypesEnabled).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), + 
"Standard_D2s_v3 (2 vCPUs) should be excluded when LocalDNS is required") + + // Verify that large SKUs (>= 4 vCPUs) are present in both + Expect(instanceTypesDisabled).Should(ContainElement(WithTransform(getName, Equal("Standard_D4s_v3"))), + "Standard_D4s_v3 (4 vCPUs) should be included when LocalDNS is disabled") + Expect(instanceTypesEnabled).Should(ContainElement(WithTransform(getName, Equal("Standard_D4s_v3"))), + "Standard_D4s_v3 (4 vCPUs) should be included when LocalDNS is required") + }) + }) - By("successfully scheduling in a different zone on retry") - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(zone)) - }) + DescribeTable("Filtering by ArtifactStreaming", + func(artifactStreaming *v1beta1.ArtifactStreaming, shouldIncludeArm64 bool) { + nodeClass.Spec.ArtifactStreaming = artifactStreaming + test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) + ExpectApplied(ctx, env.Client, nodeClass) + instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, nodeClass) + Expect(err).ToNot(HaveOccurred()) + Expect(instanceTypes).ShouldNot(BeEmpty()) - DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) { - for _, zone := range azEnv.Zones() { - azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } - instanceTypes, err := azEnv.InstanceTypesProvider.List(ctx, nodeClass) - Expect(err).ToNot(HaveOccurred()) + getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } - seeUnavailable := false - for _, instanceType := range instanceTypes { - if instanceType.Name == "Standard_D2_v2" 
{ - seeUnavailable = true - if azEnv == azureEnv { - Expect(lo.Map(instanceType.Offerings.Available(), func(offering *corecloudprovider.Offering, _ int) string { - return offering.Requirements.Get(v1.LabelTopologyZone).Any() - })).To(ConsistOf(zones.Regional, zones.Regional)) - } else { - Expect(len(instanceType.Offerings.Available())).To(Equal(0)) - } - } else { - Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) - } - } - // we should see the unavailable offering in the list - Expect(seeUnavailable).To(BeTrue()) - }, - Entry("zonal", azureEnv), - Entry("non-zonal", azureEnvNonZonal), - ) + if shouldIncludeArm64 { + Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D16plds_v5"))), + "ARM64 instance type Standard_D16plds_v5 should be included") + } else { + Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_D16plds_v5"))), + "ARM64 instance type Standard_D16plds_v5 should be excluded") + } - It("should launch instances in a different zone than preferred", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", fake.MakeSKU("Standard_D2_v2"), fakeZone1, karpv1.CapacityTypeSpot) + // AMD64 instance types should always be included regardless of artifact streaming setting + Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2s_v3"))), + "AMD64 instance type Standard_D2s_v3 should always be included") + }, + Entry("when artifact streaming is not set (default) - includes ARM64", + nil, true), + Entry("when artifact streaming is explicitly enabled - excludes ARM64", + &v1beta1.ArtifactStreaming{Enabled: lo.ToPtr(true)}, false), + Entry("when artifact streaming is explicitly disabled - includes ARM64", + &v1beta1.ArtifactStreaming{Enabled: lo.ToPtr(false)}, true), + ) 
+ + Context("Ephemeral Disk", func() { + var originalOptions *options.Options + BeforeEach(func() { + originalOptions = options.FromContext(ctx) + ctx = options.ToContext( + ctx, + test.Options(test.OptionsFields{ + UseSIG: lo.ToPtr(true), + })) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, - }) - pod.Spec.Affinity = &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ - { - Weight: 1, - Preference: v1.NodeSelectorTerm{ - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{fakeZone1}, - }, - }, - }, - }, - }, - }, - } - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelTopologyZone]).ToNot(Equal(fakeZone1)) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) - }) - It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeOnDemand) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_F16s_v2"), fakeZone1, karpv1.CapacityTypeSpot) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, - Operator: v1.NodeSelectorOpIn, - Values: []string{"Standard_DS2_v2", "Standard_F16s_v2"}}) - pods := []*v1.Pod{} - for i := 0; i < 2; i++ { - pods = append(pods, coretest.UnschedulablePod(coretest.PodOptions{ - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{v1.ResourceCPU: 
resource.MustParse("1")}, - }, - NodeSelector: map[string]string{ - v1.LabelTopologyZone: fakeZone1, - }, - })) - } - // Provisions 2 smaller instances since larger was ICE'd - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) - - nodeNames := sets.New[string]() - for _, pod := range pods { - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_DS2_v2")) - nodeNames.Insert(node.Name) - } - Expect(nodeNames.Len()).To(Equal(2)) - }) - DescribeTable("should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry", - func(azureEnv *test.Environment, cluster *state.Cluster, cloudProvider *cloudprovider.CloudProvider, coreProvisioner *provisioning.Provisioner) { - for _, zone := range azureEnv.Zones() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) - } - if azureEnv == azureEnvNonZonal { - // Non-zonal environments already use zone="0" as their only zone. 
- } else { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zones.Regional, karpv1.CapacityTypeOnDemand) - } + // Repopulate instance types based on above ctx + Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + }) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - NodeSelector: map[string]string{v1.LabelInstanceTypeStable: "Standard_D2_v2"}, - }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - // capacity shortage is over - expire the items from the cache and try again - azureEnv.UnavailableOfferingsCache.Flush() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "Standard_D2_v2")) - }, - Entry("zonal", azureEnv, cluster, cloudProvider, coreProvisioner), - Entry("non-zonal", azureEnvNonZonal, clusterNonZonal, cloudProviderNonZonal, coreProvisionerNonZonal), + AfterEach(func() { + ctx = options.ToContext(ctx, originalOptions) + // Clean up instance types + Expect(azureEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + }) + + Context("FindMaxEphemeralSizeGBAndPlacement(sku *skewer.SKU) -> diskSizeGB, *placement", func() { + // B20ms: + // NvmeDiskSizeInMiB == 0 + // CacheDiskBytes == 32212254720 -> 32.21225472 GB .. 
we should select this as the ephemeral disk size + // placement == CacheDisk + // MaxResourceVolumeMB == 163840 MiB -> 171.80 GB, + // Standard_D128ds_v6: + // NvmeDiskSizeInMiB == 7208960 -> 7559.142441 GB // SupportedEphemeralOSDiskPlacements == NvmeDisk + // and this is greater than 0, so we select 7559, placement == NvmeDisk + // Standard_D16plds_v5: + // NvmeDiskSizeInMiB == 0 + // CacheDiskBytes == 429496729600 -> 429.4967296 GB, this is greater than zero, so we select this as the ephemeral disk size + // placement == CacheDisk and size == 429.4967296 GB + // MaxResourceVolumeMB == 614400 MiB + // Standard_D2as_v6: -> EphemeralOSDiskSupported is false, it should return 0 and nil for placement + // Standard_D128ds_v6: + // NvmeDiskSizeInMiB == 7208960 -> 7559.142441 GB // SupportedEphemeralOSDiskPlacements == NvmeDisk + // and this is greater than 0, so we select 7559, placement == NvmeDisk + // Standard_NC24ads_A100_v4: + // {Name: lo.ToPtr("SupportedEphemeralOSDiskPlacements"), Value: lo.ToPtr("ResourceDisk,CacheDisk")}, + // NvmeDiskSizeInMiB == 915527 -> 959.99964 GB but no SupportedEphemeralOSDiskPlacements == NvmeDisk so we move to cache disk + // CacheDiskBytes == 274877906944 -> 274.877906944 GB so we select cache disk + 274 + // MaxResourceVolumeMB == 65536 MiB + // Standard_D64s_v3: + // NvmeDiskSizeInMiB == 0 + // CacheDiskBytes == 1717986918400 -> 1717.9869184 GB, this is greater than zero, so we select this as the ephemeral disk size + // placement == CacheDisk and size == 1717 GB + // Standard_A0 + // NvmeDiskSizeInMiB == 0 + // CacheDiskBytes == 0, this is zero + // MaxResourceVolumeMB == 20480 MiB -> 21.474836 GB. 
Note that this sku doesn't support ephemeral os disk + DescribeTable("should return the max ephemeral disk size in GB for a given instance type", + func(sku *skewer.SKU, expectedSize int64, expectedPlacement *armcompute.DiffDiskPlacement) { + sizeGB, placement := instancetype.FindMaxEphemeralSizeGBAndPlacement(sku) + Expect(sizeGB).To(Equal(expectedSize)) + Expect(placement).To(Equal(expectedPlacement)) + }, Entry("Standard_B20ms", fake.MakeSKU("Standard_B20ms"), int64(32), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), + Entry("Standard_D128ds_v6", fake.MakeSKU("Standard_D128ds_v6"), int64(7559), lo.ToPtr(armcompute.DiffDiskPlacementNvmeDisk)), + Entry("Standard_D16plds_v5", fake.MakeSKU("Standard_D16plds_v5"), int64(429), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), + Entry("Standard_D2as_v6", fake.MakeSKU("Standard_D2as_v6"), int64(0), nil), // does not support ephemeral + Entry("Standard_NC24ads_A100_v4", fake.MakeSKU("Standard_NC24ads_A100_v4"), int64(274), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), + Entry("Standard_D64s_v3", fake.MakeSKU("Standard_D64s_v3"), int64(1717), lo.ToPtr(armcompute.DiffDiskPlacementCacheDisk)), + Entry("Standard_A0", fake.MakeSKU("Standard_A0"), int64(0), nil), // does not support ephemeral + Entry("Standard_D2_v2", fake.MakeSKU("Standard_D2_v2"), int64(0), nil), // does not support ephemeral + // TODO: codegen + // Entry("Standard_D2pls_v5", fake.MakeSKU("Standard_D2pls_v5"), int64(0), nil), // does not support ephemeral + // Entry("Standard_D2lds_v5", fake.MakeSKU("Standard_D2lds_v5"), int64(80), armcompute.DiffDiskPlacementResourceDisk), + Entry("Nil SKU", nil, int64(0), nil), ) + }) + Context("Placement", func() { + }) - Context("SkuNotAvailable", func() { - AssertUnavailable := func(sku *skewer.SKU, capacityType string) { - // fake a SKU not available error - azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set( - &azcore.ResponseError{ErrorCode: sdkerrors.SKUNotAvailableErrorCode}, - ) 
- coretest.ReplaceRequirements(nodePool, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, Values: []string{sku.GetName()}}, - karpv1.NodeSelectorRequirementWithMinValues{ - Key: karpv1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{capacityType}}, - ) - ExpectApplied(ctx, env.Client, nodeClass, nodePool) - pod := coretest.UnschedulablePod() - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectNotScheduled(ctx, env.Client, pod) - for _, zoneID := range []string{"1", "2", "3"} { - ExpectUnavailable(azureEnv, sku, zones.MakeAKSLabelZoneFromARMZone(fake.Region, zoneID), capacityType) - } - } + }) - It("should mark SKU as unavailable in all zones for Spot", func() { - AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeSpot) - }) + Context("Zone-aware provisioning", func() { + It("should not include empty zone domain in instance type offerings", func() { + // Verify that no instance type has an offering with zone="" + // which would introduce a phantom domain in topology spread constraint calculations. 
+ ExpectApplied(ctx, env.Client, nodePool, nodeClass) + instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool) + Expect(err).ToNot(HaveOccurred()) + Expect(instanceTypes).ToNot(BeEmpty()) - It("should mark SKU as unavailable in all zones for OnDemand", func() { - AssertUnavailable(defaultTestSKU, karpv1.CapacityTypeOnDemand) - }) - }) + for _, it := range instanceTypes { + for _, offering := range it.Offerings { + zone := offering.Requirements.Get(v1.LabelTopologyZone).Any() + Expect(zone).ToNot(BeEmpty(), + fmt.Sprintf("instance type %s has an offering with empty zone, which breaks topology spread constraints", it.Name)) + } + } }) }) + Context("Unavailable Offerings", func() { + DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) { + for _, zone := range azEnv.Zones() { + azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeSpot) + azEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", fake.MakeSKU("Standard_D2_v2"), zone, karpv1.CapacityTypeOnDemand) + } + instanceTypes, err := azEnv.InstanceTypesProvider.List(ctx, nodeClass) + Expect(err).ToNot(HaveOccurred()) + + seeUnavailable := false + for _, instanceType := range instanceTypes { + if instanceType.Name == "Standard_D2_v2" { + seeUnavailable = true + if azEnv == azureEnv { + Expect(lo.Map(instanceType.Offerings.Available(), func(offering *corecloudprovider.Offering, _ int) string { + return offering.Requirements.Get(v1.LabelTopologyZone).Any() + })).To(ConsistOf(zones.Regional, zones.Regional)) + } else { + Expect(len(instanceType.Offerings.Available())).To(Equal(0)) + } + } else { + Expect(len(instanceType.Offerings.Available())).To(Not(Equal(0))) + } + } + // we should see the unavailable offering in the list + Expect(seeUnavailable).To(BeTrue()) + }, + Entry("zonal", azureEnv), + Entry("non-zonal", azureEnvNonZonal), + ) + }) + Context("Provider 
List", func() { Context("Filtering in InstanceType", func() { var instanceTypes corecloudprovider.InstanceTypes @@ -2582,364 +834,6 @@ var _ = Describe("InstanceType Provider", func() { } }) - // TODO: Is this stuff really about Provider List? Feels like no, should we put it elsewhere? - type WellKnownLabelEntry struct { - Name string - Label string - ValueFunc func() string - SetupFunc func() - // ExpectedInKubeletLabels indicates if we expect to see this in the KUBELET_NODE_LABELS section of the custom script extension. - // If this is false it means that Karpenter will not set it on the node via KUBELET_NODE_LABELS. - // It does NOT mean that it will not be on the resulting Node object in a real cluster, as it may be written by another process. - // We expect that if ExpectedOnNode is set, ExpectedInKubeletLabels is also set. - ExpectedInKubeletLabels bool - // ExpectedOnNode indicates if we expect to see this on the node. - // If this is false it means is that Karpenter will not set it on the node directly via kube-apiserver. - // It does NOT mean that it will not be on the resulting Node object in a real cluster, as it may be written as part of KUBELET_NODE_LABELS (see above) - // or by another process. We're asserting on this distinction currently because it helps clarify who is doing what - ExpectedOnNode bool - } - - // requireFunc returns a SetupFunc that adds a label requirement to the NodePool - requireFunc := func(key, value string) func() { - return func() { - nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, - karpv1.NodeSelectorRequirementWithMinValues{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}, - ) - } - } - - // TODO: Is this stuff really about Provider List? Feels like no, should we put it elsewhere? 
- entries := []WellKnownLabelEntry{ - // Well known - {Name: v1.LabelTopologyRegion, Label: v1.LabelTopologyRegion, ValueFunc: func() string { return fake.Region }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: karpv1.NodePoolLabelKey, Label: karpv1.NodePoolLabelKey, ValueFunc: func() string { return nodePool.Name }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1.LabelTopologyZone, Label: v1.LabelTopologyZone, ValueFunc: func() string { return fakeZone1 }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1.LabelInstanceTypeStable, Label: v1.LabelInstanceTypeStable, ValueFunc: func() string { return "Standard_NC24ads_A100_v4" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1.LabelOSStable, Label: v1.LabelOSStable, ValueFunc: func() string { return "linux" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1.LabelArchStable, Label: v1.LabelArchStable, ValueFunc: func() string { return "amd64" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: karpv1.CapacityTypeLabelKey, Label: karpv1.CapacityTypeLabelKey, ValueFunc: func() string { return "on-demand" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelPlacementScope, Label: v1beta1.LabelPlacementScope, ValueFunc: func() string { return v1beta1.PlacementScopeZonal }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - // Well Known to AKS - {Name: v1beta1.LabelSKUName, Label: v1beta1.LabelSKUName, ValueFunc: func() string { return "Standard_NC24ads_A100_v4" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUFamily, Label: v1beta1.LabelSKUFamily, ValueFunc: func() string { return "N" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUSeries, Label: v1beta1.LabelSKUSeries, ValueFunc: func() string { return "NCads_v4" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUVersion, Label: 
v1beta1.LabelSKUVersion, ValueFunc: func() string { return "4" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUStorageEphemeralOSMaxSize, Label: v1beta1.LabelSKUStorageEphemeralOSMaxSize, ValueFunc: func() string { return "429" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUAcceleratedNetworking, Label: v1beta1.LabelSKUAcceleratedNetworking, ValueFunc: func() string { return "true" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUStoragePremiumCapable, Label: v1beta1.LabelSKUStoragePremiumCapable, ValueFunc: func() string { return "true" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUGPUName, Label: v1beta1.LabelSKUGPUName, ValueFunc: func() string { return "A100" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUGPUManufacturer, Label: v1beta1.LabelSKUGPUManufacturer, ValueFunc: func() string { return "nvidia" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUGPUCount, Label: v1beta1.LabelSKUGPUCount, ValueFunc: func() string { return "1" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUCPU, Label: v1beta1.LabelSKUCPU, ValueFunc: func() string { return "24" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.LabelSKUMemory, Label: v1beta1.LabelSKUMemory, ValueFunc: func() string { return "8192" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - // AKS domain - {Name: v1beta1.AKSLabelCPU, Label: v1beta1.AKSLabelCPU, ValueFunc: func() string { return "24" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelMemory, Label: v1beta1.AKSLabelMemory, ValueFunc: func() string { return "8192" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelMode + "=user", Label: v1beta1.AKSLabelMode, ValueFunc: func() string { return "user" }, ExpectedInKubeletLabels: 
true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelMode + "=system", Label: v1beta1.AKSLabelMode, ValueFunc: func() string { return "system" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelScaleSetPriority + "=regular", Label: v1beta1.AKSLabelScaleSetPriority, ValueFunc: func() string { return "regular" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelScaleSetPriority + "=spot", Label: v1beta1.AKSLabelScaleSetPriority, ValueFunc: func() string { return "spot" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelPriority + "=regular", Label: v1beta1.AKSLabelPriority, ValueFunc: func() string { return "regular" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelPriority + "=spot", Label: v1beta1.AKSLabelPriority, ValueFunc: func() string { return "spot" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - {Name: v1beta1.AKSLabelOSSKU, Label: v1beta1.AKSLabelOSSKU, ValueFunc: func() string { return "Ubuntu" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - { - Name: v1beta1.AKSLabelFIPSEnabled, - Label: v1beta1.AKSLabelFIPSEnabled, - // Needs special setup because it only works on FIPS - SetupFunc: func() { - testOptions.UseSIG = true - ctx = options.ToContext(ctx, testOptions) - - nodeClass.Spec.FIPSMode = &v1beta1.FIPSModeFIPS - nodeClass.Spec.ImageFamily = lo.ToPtr(v1beta1.AzureLinuxImageFamily) - test.ApplyDefaultStatus(nodeClass, env, testOptions.UseSIG) - }, - ValueFunc: func() string { return "true" }, - ExpectedInKubeletLabels: true, - ExpectedOnNode: true, - }, - // Deprecated Labels -- note that these are not expected in kubelet labels or on the node. - // They are written by CloudProvider so don't need to be sent to kubelet, and they aren't required on the node object because Karpenter does a mapping from - // the new labels to the old labels for compatibility. 
- {Name: v1.LabelFailureDomainBetaRegion, Label: v1.LabelFailureDomainBetaRegion, ValueFunc: func() string { return fake.Region }, ExpectedInKubeletLabels: false, ExpectedOnNode: false}, - {Name: v1.LabelFailureDomainBetaZone, Label: v1.LabelFailureDomainBetaZone, ValueFunc: func() string { return fakeZone1 }, ExpectedInKubeletLabels: false, ExpectedOnNode: false}, - {Name: "beta.kubernetes.io/arch", Label: "beta.kubernetes.io/arch", ValueFunc: func() string { return "amd64" }, ExpectedInKubeletLabels: false, ExpectedOnNode: false}, - {Name: "beta.kubernetes.io/os", Label: "beta.kubernetes.io/os", ValueFunc: func() string { return "linux" }, ExpectedInKubeletLabels: false, ExpectedOnNode: false}, - {Name: v1.LabelInstanceType, Label: v1.LabelInstanceType, ValueFunc: func() string { return "Standard_NC24ads_A100_v4" }, ExpectedInKubeletLabels: false, ExpectedOnNode: false}, - {Name: "topology.disk.csi.azure.com/zone", Label: "topology.disk.csi.azure.com/zone", ValueFunc: func() string { return fakeZone1 }, ExpectedInKubeletLabels: false, ExpectedOnNode: false}, - // Unsupported labels - {Name: v1.LabelWindowsBuild, Label: v1.LabelWindowsBuild, ValueFunc: func() string { return "window" }, ExpectedInKubeletLabels: true, ExpectedOnNode: false}, - // Cluster Label - {Name: v1beta1.AKSLabelCluster, Label: v1beta1.AKSLabelCluster, ValueFunc: func() string { return "test-resourceGroup" }, ExpectedInKubeletLabels: true, ExpectedOnNode: true}, - // Previously reserved labels (kubernetes.io/k8s.io domains) that were restricted by Karpenter core before 1.9.x. - // These are now allowed on NodeClaims and synced to the Node by Karpenter, but kubelet cannot set them. 
- { - Name: "kubernetes.io (previously reserved)", - Label: "kubernetes.io/custom-label", - SetupFunc: requireFunc("kubernetes.io/custom-label", "custom-value"), - ValueFunc: func() string { return "custom-value" }, - ExpectedInKubeletLabels: false, - ExpectedOnNode: true, - }, - { - Name: "k8s.io (previously reserved)", - Label: "k8s.io/custom-label", - SetupFunc: requireFunc("k8s.io/custom-label", "custom-value"), - ValueFunc: func() string { return "custom-value" }, - ExpectedInKubeletLabels: false, - ExpectedOnNode: true, - }, - // kubelet.kubernetes.io is in the kubelet-allowed namespace, so kubelet CAN set these - { - Name: "kubelet.kubernetes.io (kubelet-allowed)", - Label: "kubelet.kubernetes.io/custom-label", - SetupFunc: requireFunc("kubelet.kubernetes.io/custom-label", "custom-value"), - ValueFunc: func() string { return "custom-value" }, - ExpectedInKubeletLabels: true, - ExpectedOnNode: true, - }, - } - - It("should support individual instance type labels (when all pods scheduled at once)", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - - var podDetails []struct { - pod *v1.Pod - entry WellKnownLabelEntry - } - for _, item := range entries { - if item.SetupFunc != nil { - continue // can't support nonstandard setup here as we're putting all labels on one pod - } - podDetails = append(podDetails, struct { - pod *v1.Pod - entry WellKnownLabelEntry - }{ - pod: coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{item.Label: item.ValueFunc()}}), - entry: item, - }) - } - pods := lo.Map( - podDetails, - func(detail struct { - pod *v1.Pod - entry WellKnownLabelEntry - }, _ int) *v1.Pod { - return detail.pod - }) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pods...) 
- - // Collect all the VMs we provisioned - vmInputs := map[string]*fake.VirtualMachineCreateOrUpdateInput{} - - for vmInput := range azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.All() { - vmInputs[*vmInput.VM.Name] = vmInput - } - - for _, detail := range podDetails { - key := lo.Keys(detail.pod.Spec.NodeSelector)[0] - node := ExpectScheduled(ctx, env.Client, detail.pod) - if detail.entry.ExpectedOnNode { - Expect(node.Labels[key]).To(Equal(detail.pod.Spec.NodeSelector[key])) - } else { - Expect(node.Labels).ToNot(HaveKey(key)) - } - - // Get the VM creation input and decode custom data - // Extract the vm name from the provider ID - vmName, err := nodeclaimutils.GetVMName(node.Spec.ProviderID) - Expect(err).ToNot(HaveOccurred()) - - vm := vmInputs[vmName].VM - if detail.entry.ExpectedInKubeletLabels { - ExpectKubeletNodeLabelsInCustomData(&vm, detail.entry.Label, detail.entry.ValueFunc()) - } else { - ExpectKubeletNodeLabelsNotInCustomData(&vm, detail.entry.Label, detail.entry.ValueFunc()) - } - } - }) - - DescribeTable( - "should support individual instance type labels (when all pods scheduled individually)", - func(item WellKnownLabelEntry) { - if item.SetupFunc != nil { - item.SetupFunc() - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - value := item.ValueFunc() - - pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{item.Label: value}}) - // Simulate multiple scheduling passes before final binding, this ensures that when real scheduling happens we won't - // end up with a new node for each scheduling attempt - if item.Label != v1.LabelWindowsBuild { // TODO: special case right now as we don't support it - bindings := []Bindings{} - for range 3 { - bindings = append(bindings, ExpectProvisionedNoBinding(ctx, env.Client, clusterBootstrap, cloudProviderBootstrap, coreProvisionerBootstrap, pod)) - } - for i := range len(bindings) { - Expect(lo.Values(bindings[i])).ToNot(BeEmpty()) - 
Expect(lo.Values(bindings[i])[0].Node.Name).To(Equal(lo.Values(bindings[0])[0].Node.Name), "expected all bindings to have the same node name") - } - } - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - - if item.ExpectedOnNode { - Expect(node.Labels[item.Label]).To(Equal(value)) - } else { - Expect(node.Labels).ToNot(HaveKey(item.Label)) - } - - // Get the VM creation input and decode custom data - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vmInput := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - vm := vmInput.VM - if item.ExpectedInKubeletLabels { - ExpectKubeletNodeLabelsInCustomData(&vm, item.Label, value) - } else { - ExpectKubeletNodeLabelsNotInCustomData(&vm, item.Label, value) - } - }, - lo.Map(entries, func(item WellKnownLabelEntry, _ int) TableEntry { - return Entry(item.Name, item) - }), - ) - - DescribeTable( - "should support individual instance type labels (when all pods scheduled individually) on bootstrap API", - func(item WellKnownLabelEntry) { - if item.SetupFunc != nil { - item.SetupFunc() - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - value := item.ValueFunc() - - pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{item.Label: value}}) - // Simulate multiple scheduling passes before final binding, this ensures that when real scheduling happens we won't - // end up with a new node for each scheduling attempt - if item.Label != v1.LabelWindowsBuild { // TODO: special case right now as we don't support it - bindings := []Bindings{} - for range 3 { - bindings = append(bindings, ExpectProvisionedNoBinding(ctx, env.Client, clusterBootstrap, cloudProviderBootstrap, coreProvisionerBootstrap, pod)) - } - for i := range len(bindings) { - Expect(lo.Values(bindings[i])).ToNot(BeEmpty()) - 
Expect(lo.Values(bindings[i])[0].Node.Name).To(Equal(lo.Values(bindings[0])[0].Node.Name), "expected all bindings to have the same node name") - } - } - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterBootstrap, cloudProviderBootstrap, coreProvisionerBootstrap, azureEnvBootstrap, pod) - - node := ExpectScheduled(ctx, env.Client, pod) - - if item.ExpectedOnNode { - Expect(node.Labels[item.Label]).To(Equal(value)) - } else { - Expect(node.Labels).ToNot(HaveKey(item.Label)) - } - - // Get the bootstrap API input - Expect(azureEnvBootstrap.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Len()).To(Equal(1)) - bootstrapInput := azureEnvBootstrap.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Pop() - if item.ExpectedInKubeletLabels { - Expect(bootstrapInput.Params.ProvisionProfile.CustomNodeLabels).To(HaveKeyWithValue(item.Label, value)) - } else { - Expect(bootstrapInput.Params.ProvisionProfile.CustomNodeLabels).ToNot(HaveKeyWithValue(item.Label, value)) - } - }, - lo.Map(entries, func(item WellKnownLabelEntry, _ int) TableEntry { - return Entry(item.Name, item) - }), - ) - - It("entries should cover every WellKnownLabel", func() { - expectedLabels := append(karpv1.WellKnownLabels.UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...) 
- Expect(lo.Map(entries, func(item WellKnownLabelEntry, _ int) string { return item.Label })).To(ContainElements(expectedLabels)) - }) - - nonSchedulableLabels := map[string]string{ - labels.AKSLabelRole: "agent", - v1beta1.AKSLabelKubeletIdentityClientID: test.Options().KubeletIdentityClientID, - "kubernetes.azure.com/mode": "user", // TODO: Will become a WellKnownLabel soon - //We expect the vnetInfoLabels because we're simulating network plugin Azure by default and they are included there - labels.AKSLabelSubnetName: "aks-subnet", - labels.AKSLabelVNetGUID: test.Options().VnetGUID, - labels.AKSLabelAzureCNIOverlay: strconv.FormatBool(true), - labels.AKSLabelPodNetworkType: consts.NetworkPluginModeOverlay, - karpv1.NodeDoNotSyncTaintsLabelKey: "true", - } - - It("should write other (non-schedulable) labels to kubelet", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Not checking on the node as not all these labels are expected there (via Karpenter setting them, they'll get there via kubelet) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vmInput := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - vm := vmInput.VM - for key, value := range nonSchedulableLabels { - ExpectKubeletNodeLabelsInCustomData(&vm, key, value) - } - }) - - DescribeTable("should not write restricted labels to kubelet, but should write allowed labels", func(domain string, allowed bool) { - nodePool.Spec.Template.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ - {Key: domain + "/team", Operator: v1.NodeSelectorOpExists}, - {Key: domain + "/custom-label", Operator: v1.NodeSelectorOpExists}, - {Key: "subdomain." 
+ domain + "/custom-label", Operator: v1.NodeSelectorOpExists}, - } - - nodeSelector := map[string]string{ - domain + "/team": "team-1", - domain + "/custom-label": "custom-value", - "subdomain." + domain + "/custom-label": "custom-value", - } - - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: nodeSelector}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, cluster, cloudProvider, coreProvisioner, azureEnv, pod) - node := ExpectScheduled(ctx, env.Client, pod) - - // Not checking on the node as not all these labels are expected there (via Karpenter setting them, they'll get there via kubelet) - - Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) - vmInput := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop() - vm := vmInput.VM - - // Ensure that the requirements/labels specified above are propagated onto the node and that it didn't do so via kubelet labels - for k, v := range nodeSelector { - Expect(node.Labels).To(HaveKeyWithValue(k, v)) - if allowed { - ExpectKubeletNodeLabelsInCustomData(&vm, k, v) - } else { - ExpectKubeletNodeLabelsNotInCustomData(&vm, k, v) - } - } - }, - Entry("node-restriction.kubernetes.io", "node-restriction.kubernetes.io", false), - Entry("node.kubernetes.io", "node.kubernetes.io", true), - ) - - It("should write other (non-schedulable) labels to kubelet on bootstrap API", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{}) - ExpectProvisionedAndWaitForPromises(ctx, env.Client, clusterBootstrap, cloudProviderBootstrap, coreProvisionerBootstrap, azureEnvBootstrap, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Not checking on the node as not all these labels are expected there (via Karpenter setting them, they'll get there via kubelet) - - 
Expect(azureEnvBootstrap.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Len()).To(Equal(1)) - bootstrapInput := azureEnvBootstrap.NodeBootstrappingAPI.NodeBootstrappingGetBehavior.CalledWithInput.Pop() - for key, value := range nonSchedulableLabels { - Expect(bootstrapInput.Params.ProvisionProfile.CustomNodeLabels).To(HaveKeyWithValue(key, value)) - } - }) - It("should propagate all values to requirements from skewer", func() { var gpuNode *corecloudprovider.InstanceType var normalNode *corecloudprovider.InstanceType @@ -3045,17 +939,6 @@ func ExpectKubeletFlagsPassed(customData string) string { return customData[strings.Index(customData, "KUBELET_FLAGS=")+len("KUBELET_FLAGS=") : strings.Index(customData, "KUBELET_NODE_LABELS")] } -func ExpectKubeletNodeLabelsPassed(customData string) string { - GinkgoHelper() - startIdx := strings.Index(customData, "KUBELET_NODE_LABELS=") + len("KUBELET_NODE_LABELS=") - endIdx := strings.Index(customData[startIdx:], "\n") - if endIdx == -1 { - // If no newline found, take to the end - return customData[startIdx:] - } - return customData[startIdx : startIdx+endIdx] -} - func ExpectCapacityPodsToMatchMaxPods(instanceTypes []*corecloudprovider.InstanceType, expectedMaxPods int32) { GinkgoHelper() expected := int64(expectedMaxPods) @@ -3067,41 +950,3 @@ func ExpectCapacityPodsToMatchMaxPods(instanceTypes []*corecloudprovider.Instanc Expect(podsCount).To(Equal(expected), "pods capacity does not match expected value") } } - -func ExpectKubeletNodeLabelsInCustomData(vm *armcompute.VirtualMachine, key string, value string) { - GinkgoHelper() - - Expect(vm.Properties).ToNot(BeNil()) - Expect(vm.Properties.OSProfile).ToNot(BeNil()) - Expect(vm.Properties.OSProfile.CustomData).ToNot(BeNil()) - - customData := *vm.Properties.OSProfile.CustomData - Expect(customData).ToNot(BeNil()) - - decodedBytes, err := base64.StdEncoding.DecodeString(customData) - Expect(err).To(Succeed()) - decodedString := string(decodedBytes[:]) - - 
// Extract and check KUBELET_NODE_LABELS contains the expected label - kubeletNodeLabels := ExpectKubeletNodeLabelsPassed(decodedString) - Expect(kubeletNodeLabels).To(ContainSubstring(fmt.Sprintf("%s=%s", key, value))) -} - -func ExpectKubeletNodeLabelsNotInCustomData(vm *armcompute.VirtualMachine, key string, value string) { - GinkgoHelper() - - Expect(vm.Properties).ToNot(BeNil()) - Expect(vm.Properties.OSProfile).ToNot(BeNil()) - Expect(vm.Properties.OSProfile.CustomData).ToNot(BeNil()) - - customData := *vm.Properties.OSProfile.CustomData - Expect(customData).ToNot(BeNil()) - - decodedBytes, err := base64.StdEncoding.DecodeString(customData) - Expect(err).To(Succeed()) - decodedString := string(decodedBytes[:]) - - // Extract and check KUBELET_NODE_LABELS contains the expected label - kubeletNodeLabels := ExpectKubeletNodeLabelsPassed(decodedString) - Expect(kubeletNodeLabels).ToNot(ContainSubstring(fmt.Sprintf("%s=%s", key, value))) -} From 4c464749304a9e75b7ea60f22d5c4f1d305c8579 Mon Sep 17 00:00:00 2001 From: Robin Deeboonchai Date: Mon, 8 Jun 2026 22:41:55 -0700 Subject: [PATCH 3/4] test: fix reunified CI failures Thread the existing test timeout into the Ginkgo package run and clean up lint issues from the shared test registration changes. Co-Authored-By: Claude Opus 4.7 --- Makefile | 1 + pkg/cloudprovider/suite_features_test.go | 8 ++++---- pkg/cloudprovider/suite_integration_test.go | 7 ++++--- pkg/cloudprovider/suite_offerings_test.go | 1 + pkg/providers/instancetype/suite_test.go | 10 ---------- 5 files changed, 10 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 53d652ad8..803ff3350 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,7 @@ test: ## Run tests -cover -coverprofile=coverage.out -output-dir=. -coverpkg=./pkg/... \ --focus="${FOCUS}" \ --randomize-all \ + --timeout=${TEST_TIMEOUT} \ ./pkg/... 
deflake: ## Run randomized, racing, code-covered tests to deflake failures diff --git a/pkg/cloudprovider/suite_features_test.go b/pkg/cloudprovider/suite_features_test.go index d9a85bea5..b5b53bd86 100644 --- a/pkg/cloudprovider/suite_features_test.go +++ b/pkg/cloudprovider/suite_features_test.go @@ -59,6 +59,7 @@ import ( nodeclaimutils "github.com/Azure/karpenter-provider-azure/pkg/utils/nodeclaim" ) +//nolint:gocyclo func runFeatureTests(provisionMode provisionModeTestCase) { Context("Create - GPU Workloads + Nodes", func() { It("should schedule non-GPU pod onto the cheapest non-GPU capable node", func() { @@ -2247,7 +2248,6 @@ func runFeatureTests(provisionMode provisionModeTestCase) { Expect(lo.FromPtr(kubeOverride.ServeStale)).To(Equal(armcontainerservice.LocalDNSServeStaleVerify)) }) }) - } } @@ -2448,13 +2448,13 @@ var _ = Describe("CloudProvider", func() { }) It("should not reattempt creation of a vm thats been created before, and also not CSE", func() { - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ + testNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, }, Spec: karpv1.NodeClaimSpec{NodeClassRef: &karpv1.NodeClassReference{Name: nodeClass.Name}}, }) - vmName := instance.GenerateResourceName(nodeClaim.Name) + vmName := instance.GenerateResourceName(testNodeClaim.Name) vm := &armcompute.VirtualMachine{ Name: lo.ToPtr(vmName), ID: lo.ToPtr(fake.MkVMID(options.FromContext(ctx).NodeResourceGroup, vmName)), @@ -2469,7 +2469,7 @@ var _ = Describe("CloudProvider", func() { } azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - _, err := cloudProvider.Create(ctx, nodeClaim) + _, err := cloudProvider.Create(ctx, testNodeClaim) Expect(err).ToNot(HaveOccurred()) ExpectCSENotProvisioned(azureEnv) diff --git a/pkg/cloudprovider/suite_integration_test.go 
b/pkg/cloudprovider/suite_integration_test.go index 1c190c4b0..e162ca13d 100644 --- a/pkg/cloudprovider/suite_integration_test.go +++ b/pkg/cloudprovider/suite_integration_test.go @@ -230,13 +230,13 @@ func runIntegrationTests(provisionMode provisionModeTestCase) { if !provisionMode.isAKSMachineMode() { // TODO: share this with Machine API mode It("should not reattempt creation of a vm thats been created before", func() { - nodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ + testNodeClaim := coretest.NodeClaim(karpv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{karpv1.NodePoolLabelKey: nodePool.Name}, }, Spec: karpv1.NodeClaimSpec{NodeClassRef: &karpv1.NodeClassReference{Name: nodeClass.Name}}, }) - vmName := instance.GenerateResourceName(nodeClaim.Name) + vmName := instance.GenerateResourceName(testNodeClaim.Name) vm := &armcompute.VirtualMachine{ Name: lo.ToPtr(vmName), ID: lo.ToPtr(fake.MkVMID(options.FromContext(ctx).NodeResourceGroup, vmName)), @@ -251,7 +251,7 @@ func runIntegrationTests(provisionMode provisionModeTestCase) { } azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, nodeClaim) + _, err := CreateAndWaitForPromises(ctx, cloudProvider, azureEnv, testNodeClaim) Expect(err).ToNot(HaveOccurred()) }) @@ -368,6 +368,7 @@ func runNodeOverlayCapacityTests(testOptions nodeOverlayCapacityTestOptions) { }) } +//nolint:gocyclo func runUnhappyPathHandlingTests(provisionMode provisionModeTestCase) { Context("Unexpected API Failures", func() { It("should handle create failures - unrecognized error during sync/initial", func() { diff --git a/pkg/cloudprovider/suite_offerings_test.go b/pkg/cloudprovider/suite_offerings_test.go index 51dd19750..7770fc7dc 100644 --- a/pkg/cloudprovider/suite_offerings_test.go +++ b/pkg/cloudprovider/suite_offerings_test.go @@ -54,6 +54,7 @@ import ( "github.com/Azure/skewer" ) 
+//nolint:gocyclo func runOfferingTests(provisionMode provisionModeTestCase) { Context("Create - Expected Creation Failures", func() { It("should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed", func() { diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 57a424422..3d8ccccdc 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -17,10 +17,8 @@ limitations under the License. package instancetype_test import ( - "bytes" "context" "fmt" - "io" "strings" "testing" @@ -70,10 +68,6 @@ var coreProvisioner, coreProvisionerNonZonal, coreProvisionerBootstrap *provisio var cluster, clusterNonZonal, clusterBootstrap *state.Cluster var cloudProvider, cloudProviderNonZonal, cloudProviderBootstrap *cloudprovider.CloudProvider -var fakeZone1 = zones.MakeAKSLabelZoneFromARMZone(fake.Region, "1") - -var defaultTestSKU = fake.MakeSKU("Standard_D2_v3") - func TestAzure(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) @@ -930,10 +924,6 @@ var _ = Describe("Tax Calculator", func() { }) }) -func createSDKErrorBody(code, message string) io.ReadCloser { - return io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"error":{"code": "%s", "message": "%s"}}`, code, message)))) -} - func ExpectKubeletFlagsPassed(customData string) string { GinkgoHelper() return customData[strings.Index(customData, "KUBELET_FLAGS=")+len("KUBELET_FLAGS=") : strings.Index(customData, "KUBELET_NODE_LABELS")] From 8af3d809051de2233f38c74945042a96c6ea0cbc Mon Sep 17 00:00:00 2001 From: Robin Deeboonchai Date: Tue, 16 Jun 2026 12:49:29 -0700 Subject: [PATCH 4/4] temp: test move logs --- designs/temp-acceptance-tests-moved.md | 122 +++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 designs/temp-acceptance-tests-moved.md diff --git a/designs/temp-acceptance-tests-moved.md b/designs/temp-acceptance-tests-moved.md new 
file mode 100644 index 000000000..00cc3a10d --- /dev/null +++ b/designs/temp-acceptance-tests-moved.md @@ -0,0 +1,122 @@ +# Wrong-location test moves + +This log tracks tests that, before this refactor, lived in the wrong module or the wrong local test grouping. Each row records the test's original disposition (where it lived), the action taken, its current disposition (where its coverage lives now), how coverage changed, and the justification for the current disposition. + +| # | Original disposition | Action | Current disposition | Coverage changes | Justification | +|---:|---|---|---|---|---| +| 1 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Subnet / It: should use the VNET_SUBNET_ID` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Subnet / It: should use the VNET_SUBNET_ID` | No change. | Scriptless NIC subnet ID from cluster SubnetID. | +| 2 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Subnet / It: should produce all required azure cni labels` | Merged into CNI table | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Subnet / DescribeTable: Azure CNI node labels and agentbaker network plugin / Entry: Azure CNI w Overlay w Cilium` | No change; covered by CNI table. | Covered by AgentBaker network-plugin table. | +| 3 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Subnet / It: should include stateless CNI label for kubernetes 1.34+ set to true` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Subnet / It: should include stateless CNI label for kubernetes 1.34+ set to true` | No change. | Scriptless VM customData stateless-CNI label. 
| +| 4 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Subnet / It: should include stateless CNI label for kubernetes < 1.34 set to false` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Subnet / It: should include stateless CNI label for kubernetes < 1.34 set to false` | No change. | Scriptless VM customData stateless-CNI label. | +| 5 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Subnet / It: should use the subnet specified in the nodeclass` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Subnet / It: should use the subnet specified in the nodeclass` | No change. | Scriptless NIC subnet ID from NodeClass VNETSubnetID. | +| 6 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should not reattempt creation of a vm thats been created before` | Moved; mode-specific | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / Create - CloudProvider Create Error Cases / It: should not reattempt creation of a vm thats been created before` | No change. | Scriptless existing-VM reuse path. | +| 7 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should delete the network interface on failure to create the vm` | Moved; mode-specific | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / Create - CloudProvider Create Error Cases / It: should delete the network interface on failure to create the vm` | No change. | Scriptless NIC cleanup after VM-create failure. | +| 8 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed`; plus AKS Machine counterpart. 
| Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should fail to provision when LowPriorityCoresQuota errors are hit, then switch capacity type and succeed` | No change; Scriptless and Machine API counterparts unified. | - | +| 9 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should fail to provision when OverconstrainedZonalAllocation errors are hit, then switch zone and succeed` | No change; Scriptless and Machine API counterparts unified. | - | +| 10 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should fail to provision when OverconstrainedAllocation errors are hit, then switch capacity type and succeed` | No change; Scriptless and Machine API counterparts unified. | - | +| 11 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should fail to provision when AllocationFailure errors are hit, then switch placement and succeed`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should fail to provision when AllocationFailure errors are hit, then switch placement and succeed` | No change; Scriptless and Machine API counterparts unified. 
| - | +| 12 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should fail to provision when AllocationFailure errors are hit and regional placement is unavailable`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should fail to provision when AllocationFailure errors are hit and regional placement is unavailable` | No change; Scriptless and Machine API counterparts unified. | - | +| 13 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone` | No change; Scriptless and Machine API counterparts unified. | - | +| 14 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone` | No change; Scriptless and Machine API counterparts unified. | - | +| 15 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / VM Creation Failures / It: should return ICE if Total Regional Cores Quota errors are hit`; plus AKS Machine counterpart. 
| Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Expected Creation Failures / It: should return ICE if Total Regional Cores Quota errors are hit` | No change; Scriptless and Machine API counterparts unified. | - | +| 16 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / additional-tags / It: should add additional tags to the node`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Additional Tags / It: should add additional tags to the node` | No change; Scriptless and Machine API counterparts unified. | - | +| 17 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / Placement / It: should prefer NVMe disk if supported for ephemeral` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / Placement / It: should prefer NVMe disk if supported for ephemeral` | No change. | Scriptless VM OSDisk DiffDiskSettings. | +| 18 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / Placement / It: should not select NVMe ephemeral disk placement if the sku has an nvme disk, supports ephemeral os disk, but doesnt support NVMe placement` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / Placement / It: should not select NVMe ephemeral disk placement if the sku has an nvme disk, supports ephemeral os disk, but doesnt support NVMe placement` | No change. | Scriptless VM OSDisk DiffDiskSettings. 
| +| 19 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / Placement / It: should prefer cache disk placement when both cache and temp disk support ephemeral and fit the default 128GB threshold` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / Placement / It: should prefer cache disk placement when both cache and temp disk support ephemeral and fit the default 128GB threshold` | No change. | Scriptless VM OSDisk DiffDiskSettings. | +| 20 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / Placement / It: should select managed disk if cache disk is too small but temp disk supports ephemeral and fits osDiskSizeGB to have parity with the AKS Nodepool API` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / Placement / It: should select managed disk if cache disk is too small but temp disk supports ephemeral and fits osDiskSizeGB to have parity with the AKS Nodepool API` | No change. | Scriptless VM OSDisk without DiffDiskSettings. | +| 21 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / It: should use ephemeral disk if supported, and has space of at least 128GB by default`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / It: should use ephemeral disk if supported, and has space of at least 128GB by default` | No change; Scriptless and Machine API counterparts unified. | - | +| 22 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / It: should fail to provision if ephemeral disk ask for is too large`; plus AKS Machine counterpart. 
| Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / It: should fail to provision if ephemeral disk ask for is too large` | No change; Scriptless and Machine API counterparts unified. | - | +| 23 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / It: should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / It: should select an ephemeral disk if LabelSKUStorageEphemeralOSMaxSize is set and os disk size fits` | No change; Scriptless and Machine API counterparts unified. | - | +| 24 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / It: should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / It: should use ephemeral disk if supported, and set disk size to OSDiskSizeGB from node class` | No change; Scriptless and Machine API counterparts unified. | - | +| 25 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / It: should not use ephemeral disk if ephemeral is supported, but we don't have enough space`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / It: should not use ephemeral disk if ephemeral is supported, but we don't have enough space` | No change; Scriptless and Machine API counterparts unified. 
| - | +| 26 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Ephemeral Disk / It: should select NvmeDisk for v6 skus with maxNvmeDiskSize > 0` | Merged into NVMe placement test | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Ephemeral Disk / Placement / It: should prefer NVMe disk if supported for ephemeral` | No change; covered by NVMe placement test. | Covered by broader NVMe DiffDiskSettings test. | +| 27 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Custom DNS / It: should support provisioning with custom DNS server from options` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Custom DNS / It: should support provisioning with custom DNS server from options` | No change. | Scriptless VM customData cluster-dns flag. | +| 28 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Nodepool with KubeletConfig / It: should support provisioning with kubeletConfig, computeResources and maxPods not specified`; default-network copy with `max-pods: 250`. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Nodepool with KubeletConfig / It: should support provisioning with kubeletConfig, computeResources and maxPods not specified` | Minor: Machine API delegation path added; Scriptless coverage preserved. | Machine API delegates kubeletConfig; Scriptless uses VM customData. | +| 29 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Nodepool with KubeletConfig on a kubenet Cluster / It: should not include cilium or azure cni vnet labels` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Kubenet / It: should not include cilium or azure cni vnet labels` | No change. | Scriptless VM customData omits Azure CNI labels. 
| +| 30 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Nodepool with KubeletConfig on a kubenet Cluster / It: should support provisioning with kubeletConfig, computeResources and maxPods not specified`; kubenet copy with `max-pods: 110`. | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Kubenet / It: should support provisioning with kubeletConfig, computeResources and maxPods not specified` | No change. | Scriptless VM customData kubenet maxPods default. | +| 31 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Nodepool with KubeletConfig on a kubenet Cluster / It: should support provisioning with kubeletConfig, computeResources and maxPods specified`; kubenet copy with `nodeClass.Spec.MaxPods = 15`. | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Kubenet / It: should support provisioning with kubeletConfig, computeResources and maxPods specified` | No change. | Scriptless VM customData kubenet maxPods override. | +| 32 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageReference / It: should use shared image gallery images when options are set to UseSIG`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageReference / It: should use shared image gallery images when options are set to UseSIG` | No change; Scriptless and Machine API counterparts unified. | - | +| 33 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageReference / It: should use Community Images when options are set to UseSIG=false` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageReference / It: should use Community Images when options are set to UseSIG=false` | No change. | CIG VM image reference; Machine API does not support CIG. 
| +| 34 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: Gen2, Gen1 instance type with AKSUbuntu image family`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: Gen2, Gen1 instance type with AKSUbuntu image family` | No change; Scriptless and Machine API counterparts unified. | - | +| 35 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: Gen1 instance type with AKSUbuntu image family`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: Gen1 instance type with AKSUbuntu image family` | No change; Scriptless and Machine API counterparts unified. | - | +| 36 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: ARM instance type with AKSUbuntu image family`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: ARM instance type with AKSUbuntu image family` | No change; Scriptless and Machine API counterparts unified. | - | +| 37 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: Gen2 instance type with AzureLinux image family`; plus AKS Machine counterpart. 
| Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / It: should select the right Shared Image Gallery image for a given instance type, Gen2 instance type with AzureLinux image family` | No change; Scriptless and Machine API counterparts unified. | - | +| 38 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: Gen1 instance type with AzureLinux image family`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / It: should select the right Shared Image Gallery image for a given instance type, Gen1 instance type with AzureLinux image family` | No change; Scriptless and Machine API counterparts unified. | - | +| 39 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Shared Image Gallery image selection / Entry: ARM instance type with AzureLinux image family`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / It: should select the right Shared Image Gallery image for a given instance type, ARM instance type with AzureLinux image family` | No change; Scriptless and Machine API counterparts unified. | - | +| 40 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen2, Gen1 instance type with AKSUbuntu image family` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen2, Gen1 instance type with AKSUbuntu image family` | No change; table entry moved. | CIG VM image reference; Machine API does not support CIG. 
| +| 41 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen1 instance type with AKSUbuntu image family` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen1 instance type with AKSUbuntu image family` | No change; table entry moved. | CIG VM image reference; Machine API does not support CIG. | +| 42 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: ARM instance type with AKSUbuntu image family` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: ARM instance type with AKSUbuntu image family` | No change; table entry moved. | CIG VM image reference; Machine API does not support CIG. | +| 43 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen2 instance type with AzureLinux image family` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen2 instance type with AzureLinux image family` | No change; table entry moved. | CIG VM image reference; Machine API does not support CIG. 
| +| 44 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen1 instance type with AzureLinux image family` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: Gen1 instance type with AzureLinux image family` | No change; table entry moved. | CIG VM image reference; Machine API does not support CIG. | +| 45 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: ARM instance type with AzureLinux image family` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / ImageProvider + Image Family / DescribeTable: Community Gallery image selection / Entry: ARM instance type with AzureLinux image family` | No change; table entry moved. | CIG VM image reference; Machine API does not support CIG. | +| 46 | `pkg/cloudprovider/suite_drift_test.go / Drift / It: should not fail if nodeClass does not exist`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Drift / It: should not fail if nodeClass does not exist` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 47 | `pkg/cloudprovider/suite_drift_test.go / Drift / It: should not fail if nodePool does not exist`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Drift / It: should not fail if nodePool does not exist` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 48 | `pkg/cloudprovider/suite_drift_test.go / Drift / It: should not return drifted if the NodeClaim is valid`; duplicate blocks. 
| Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Drift / It: should not return drifted if the NodeClaim is valid` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 49 | `pkg/cloudprovider/suite_drift_test.go / Drift / It: should error drift if NodeClaim doesn't have provider id`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Drift / It: should error drift if NodeClaim doesn't have provider id` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 50 | `pkg/cloudprovider/suite_drift_test.go / Node Image Drift / It: should succeed with no drift when nothing changes`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Node Image Drift / It: should succeed with no drift when nothing changes` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 51 | `pkg/cloudprovider/suite_drift_test.go / Node Image Drift / It: should succeed with no drift when ConditionTypeImagesReady is not true`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Node Image Drift / It: should succeed with no drift when ConditionTypeImagesReady is not true` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 52 | `pkg/cloudprovider/suite_drift_test.go / Node Image Drift / It: should error when Images are empty`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Node Image Drift / It: should error when Images are empty` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. 
| +| 53 | `pkg/cloudprovider/suite_drift_test.go / Node Image Drift / It: should trigger drift when the image version changes`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Node Image Drift / It: should trigger drift when the image version changes` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 54 | `pkg/cloudprovider/suite_drift_test.go / Kubernetes Version / It: should succeed with no drift when nothing changes`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubernetes Version / It: should succeed with no drift when nothing changes` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 55 | `pkg/cloudprovider/suite_drift_test.go / Kubernetes Version / It: should succeed with no drift when KubernetesVersionReady is not true`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubernetes Version / It: should succeed with no drift when KubernetesVersionReady is not true` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 56 | `pkg/cloudprovider/suite_drift_test.go / Kubernetes Version / It: shouldn't error or be drifted when KubernetesVersion is empty`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubernetes Version / It: shouldn't error or be drifted when KubernetesVersion is empty` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 57 | `pkg/cloudprovider/suite_drift_test.go / Kubernetes Version / It: shouldn't error or be drifted when NodeName is missing`; duplicate blocks. 
| Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubernetes Version / It: shouldn't error or be drifted when NodeName is missing` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 58 | `pkg/cloudprovider/suite_drift_test.go / Kubernetes Version / It: shouldn't error or be drifted when node is not found`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubernetes Version / It: shouldn't error or be drifted when node is not found` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 59 | `pkg/cloudprovider/suite_drift_test.go / Kubernetes Version / It: shouldn't error or be drifted when node is deleting`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubernetes Version / It: shouldn't error or be drifted when node is deleting` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 60 | `pkg/cloudprovider/suite_drift_test.go / Kubernetes Version / It: should succeed with drift true when KubernetesVersion is new`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubernetes Version / It: should succeed with drift true when KubernetesVersion is new` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 61 | `pkg/cloudprovider/suite_drift_test.go / ProvisionMode = AKSScriptless / Kubelet Client ID / It: should NOT trigger drift if node doesn't have kubelet client ID label` | Kept; mode-specific | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubelet Client ID / It: should NOT trigger drift if node doesn't have kubelet client ID label` | No change; remains Scriptless-only. | In Machine API mode, kubelet client ID drift is handled by the Machine API. 
| +| 62 | `pkg/cloudprovider/suite_drift_test.go / ProvisionMode = AKSScriptless / Kubelet Client ID / It: should trigger drift if node kubelet client ID doesn't match options` | Kept; mode-specific | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Kubelet Client ID / It: should trigger drift if node kubelet client ID doesn't match options` | No change; remains Scriptless-only. | In Machine API mode, kubelet client ID drift is handled by the Machine API. | +| 63 | `pkg/cloudprovider/suite_drift_test.go / ProvisionMode = AKSScriptless / Static fields / It: should not trigger drift if NodeClass hasn't changed` | Merged into shared drift coverage | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Drift / It: should not return drifted if the NodeClaim is valid` | Minor: direct no-change `It` removed; shared valid-NodeClaim no-drift spec covers it. | Redundant with valid-NodeClaim no-drift check. | +| 64 | `pkg/cloudprovider/suite_drift_test.go / Static fields / It: should trigger drift if NodeClass subnet changed`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Static fields / It: should trigger drift if NodeClass subnet changed` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. | +| 65 | `pkg/cloudprovider/suite_drift_test.go / Static fields / It: should trigger drift if ImageFamily changed`; duplicate blocks. | Merged into shared drift helper | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Static fields / It: should trigger drift if ImageFamily changed` | No change; duplicate blocks unified. | Duplicate mode blocks collapsed into shared drift helper. 
| +| 66 | `pkg/cloudprovider/suite_drift_test.go / ProvisionMode = AKSScriptless / Node Image Drift / It: should trigger drift when the image gallery changes to SIG` | Kept; mode-specific | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / Node Image Drift / It: should trigger drift when the image gallery changes to SIG` | No change; remains Scriptless-only. | SIG gallery drift signal is Scriptless-only. | +| 67 | `pkg/cloudprovider/suite_drift_test.go / ProvisionMode = AKSMachineAPIHeaderBatch / Node Image Drift / It: should trigger drift when DriftAction field is available` | Kept; mode-specific | `pkg/cloudprovider/suite_drift_test.go / runDriftTests / AKS Machine DriftAction / It: should trigger drift when DriftAction field is available` | No change; remains Machine API-only. | Machine API DriftAction field. | +| 68 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / It: should support provisioning with no labels` | Merged into basic operations | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / It: should be able to handle basic operations` | Minor: direct no-label `It` removed; basic operations covers the scheduling path. | Redundant with basic scheduling path. | +| 69 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / It: should have VM identity set` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - VM Identity / It: should have VM identity set` | No change. | Scriptless VM managed identity field. | +| 70 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / VM Profile / It: should have OS disk and network interface set to auto-delete` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - VM Profile / It: should have OS disk and network interface set to auto-delete` | No change. | Scriptless VM and NIC delete options. 
| +| 71 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / VM Profile / It: should not create unneeded secondary ips for azure cni with overlay` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - VM Profile / It: should not create unneeded secondary ips for azure cni with overlay` | No change. | Scriptless NIC IPConfiguration count. | +| 72 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / It: should schedule non-GPU pod onto the cheapest non-GPU capable node`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - GPU Workloads + Nodes / It: should schedule non-GPU pod onto the cheapest non-GPU capable node` | No change; Scriptless and Machine API counterparts unified. | - | +| 73 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / It: should schedule GPU pod on GPU capable node`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - GPU Workloads + Nodes / It: should schedule GPU pod on GPU capable node` | No change; Scriptless and Machine API counterparts unified. | - | +| 74 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Bootstrap / It: should include or exclude --keep-terminated-pod-volumes based on kubelet version` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - MISC Bootstrap / It: should include or exclude --keep-terminated-pod-volumes based on kubelet version` | No change. | Scriptless VM customData kubelet flag. 
| +| 75 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Bootstrap / It: should include correct flags and credential provider URL when CredentialProviderURL is not empty` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - MISC Bootstrap / It: should include correct flags and credential provider URL when CredentialProviderURL is not empty` | No change. | Scriptless VM customData credential-provider flags. | +| 76 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Bootstrap / It: should include correct flags when CredentialProviderURL is empty` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - MISC Bootstrap / It: should include correct flags when CredentialProviderURL is empty` | No change. | Scriptless VM customData ACR credential flag. | +| 77 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Bootstrap / It: should include karpenter.sh/unregistered taint` | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Labels and Taints / It: should include karpenter.sh/unregistered taint` | Minor: Machine API taint assertion added; Scriptless coverage preserved. | Adds AKSMachine NodeInitializationTaints assertion. | +| 78 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = BootstrappingClient / It: should provision the node and CSE` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / ProvisionMode = BootstrappingClient / Create - Bootstrap / It: should provision the node and CSE` | No change. | NodeBootstrapping API CSE path. 
| +| 79 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = BootstrappingClient / It: should not reattempt creation of a vm thats been created before, and also not CSE` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / ProvisionMode = BootstrappingClient / Create - Bootstrap / It: should not reattempt creation of a vm thats been created before, and also not CSE` | No change. | NodeBootstrapping API VM/CSE retry path. | +| 80 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Subnet / DescribeTable: Azure CNI node labels and agentbaker network plugin` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Subnet / DescribeTable: Azure CNI node labels and agentbaker network plugin` | No change; table moved. | Scriptless VM customData AgentBaker CNI labels. | +| 81 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / It: should include loadbalancer backend pools the allocated VMs` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Load Balancer / It: should include loadbalancer backend pools the allocated VMs` | No change. | Scriptless NIC load-balancer backend pools. | +| 82 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should prefer zonal placement for zone-capable instance types by default` | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should prefer zonal placement for zone-capable instance types by default` | Minor: Machine API branch added; Scriptless coverage preserved. | Adds AKSMachine zone assertion for default placement. 
| +| 83 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should launch zone-capable instance types regionally when placement scope requires it` | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should launch zone-capable instance types regionally when placement scope requires it` | Minor: Machine API branch added; Scriptless coverage preserved. | Adds AKSMachine empty-zone assertion for regional placement. | +| 84 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should launch in the NodePool-requested zone`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should launch in the NodePool-requested zone` | No change; Scriptless and Machine API counterparts unified. | - | +| 85 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should support provisioning in non-zonal regions`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should support provisioning in non-zonal regions` | No change; Scriptless and Machine API counterparts unified. | - | +| 86 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should provision non-zonal instance types in zonal regions with zone label 0`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should support provisioning non-zonal instance types in zonal regions` | No change; title generalized and counterparts unified. 
| - | +| 87 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should exclude non-zonal instance types via zone NodePool requirements` | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should exclude non-zonal instance types via zone NodePool requirements` | Minor: Machine mode coverage added; Scriptless coverage preserved. | Adds Machine-mode non-zonal exclusion case. | +| 88 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should exclude non-zonal instance types when all real zones are specified` | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should exclude non-zonal instance types when all real zones are specified` | Minor: Machine mode coverage added; Scriptless coverage preserved. | Adds Machine-mode all-real-zones exclusion case. | +| 89 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should not include empty zone domain in instance type offerings` | Kept | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should not include empty zone domain in instance type offerings` | No change. | Provider instance-type zone-domain table. | +| 90 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Zone-aware provisioning / It: should schedule pods with zonal topology spread when non-zonal SKUs exist` | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Zone-aware provisioning / It: should schedule pods with zonal topology spread when non-zonal SKUs exist` | Minor: Machine API branch added; Scriptless coverage preserved. | Adds Machine API topology-spread scheduling case. 
| +| 91 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / CloudProvider Create Error Cases / It: should return error when NodeClass readiness is Unknown`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / Create - CloudProvider Create Error Cases / It: should return error when NodeClass readiness is Unknown` | No change; Scriptless and Machine API counterparts unified. | - | +| 92 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / CloudProvider Create Error Cases / It: should return error when instance type resolution fails`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / Create - CloudProvider Create Error Cases / It: should return error when instance type resolution fails` | No change; Scriptless and Machine API counterparts unified. | - | +| 93 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / CloudProvider Create Error Cases / It: should return error when instance creation fails`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / Create - CloudProvider Create Error Cases / It: should return error when instance creation fails` | No change; Scriptless and Machine API counterparts unified. | - | +| 94 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / It: should not allocate a vm in a zone marked as unavailable`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / It: should not allocate an instance in a zone marked as unavailable` | No change; title generalized from VM to instance. 
| - | +| 95 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / It: should handle ZonalAllocationFailed on creating the VM`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / It: should handle ZonalAllocationFailed on creating the instance` | Minor: failover/retry coverage preserved; Scriptless no longer asserts that the first pod remains unscheduled, and cleanup is now evidenced by VM Get calls. | Drops first-pod-unscheduled check; retry coverage remains. | +| 96 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / DescribeTable: Should not return unavailable offerings`; cloudprovider duplicate also existed. | Kept | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / DescribeTable: Should not return unavailable offerings` | No change. | Provider instance-type unavailable-offerings table. | +| 97 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / It: should launch instances in a different zone than preferred`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / It: should launch instances in a different zone than preferred when zone is unavailable` | No change; title clarified. | - | +| 98 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / It: should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error`; plus AKS Machine counterpart.
| Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / It: should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error` | No change; Scriptless and Machine API counterparts unified. | - | +| 99 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / DescribeTable: should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / DescribeTable: should launch instances on later reconciliation attempt with Insufficient Capacity Error Cache expiry` | No change; Scriptless and Machine API counterparts unified. | - | +| 100 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / SkuNotAvailable / It: should mark SKU as unavailable in all zones for Spot`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / SkuNotAvailable / It: should mark SKU as unavailable in all zones for Spot` | No change; Scriptless and Machine API counterparts unified. | - | +| 101 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Unavailable Offerings / SkuNotAvailable / It: should mark SKU as unavailable in all zones for OnDemand`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / SkuNotAvailable / It: should mark SKU as unavailable in all zones for OnDemand` | No change; Scriptless and Machine API counterparts unified. 
| - | +| 102 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Labels and Taints / It: should support individual instance type labels (when all pods scheduled at once)` | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Labels and Taints / It: should support individual instance type labels when all pods schedule at once` | Minor: Machine API branch added; Scriptless coverage preserved. | Adds AKSMachine Kubernetes.NodeLabels all-pods case. | +| 103 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Labels and Taints / DescribeTable: should support individual instance type labels (when all pods scheduled individually)` | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Labels and Taints / DescribeTable: should support individual instance type labels (when all pods scheduled individually)` | Minor: Machine API branch added; Scriptless coverage preserved. | Adds AKSMachine Kubernetes.NodeLabels per-pod table. | +| 104 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = BootstrappingClient / Labels and Taints / DescribeTable: should support individual instance type labels (when all pods scheduled individually) on bootstrap API` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / ProvisionMode = BootstrappingClient / Create - Bootstrap / DescribeTable: should support individual instance type labels (when all pods scheduled individually) on bootstrap API` | Minor: WindowsBuild entry skips three no-binding stability passes. | NodeBootstrapping API skips WindowsBuild no-binding entries. 
| +| 105 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Labels and Taints / It: should write other (non-schedulable) labels to kubelet` | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Labels and Taints / It: should write other (non-schedulable) labels to kubelet` | Minor: Machine API branch added; Scriptless coverage preserved. | Adds AKSMachine Kubernetes.NodeLabels negative checks. | +| 106 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Labels and Taints / DescribeTable: should not write restricted labels to kubelet, but should write allowed labels` | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Labels and Taints / DescribeTable: should not write restricted labels to kubelet, but should write allowed labels` | Minor: Machine API branch added; Scriptless coverage preserved. | Adds AKSMachine restricted-label table branch. | +| 107 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = BootstrappingClient / Labels and Taints / It: should write other (non-schedulable) labels to kubelet on bootstrap API` | Moved; mode-specific | `pkg/cloudprovider/suite_features_test.go / ProvisionMode = BootstrappingClient / Create - Bootstrap / It: should write other (non-schedulable) labels to kubelet on bootstrap API` | No change. | NodeBootstrapping API CustomNodeLabels path. | +| 108 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / It: should propagate all values to requirements from skewer` | Kept | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Instance Types / It: should propagate all values to requirements from skewer` | No change. | Provider requirement expansion from skewer. 
| +| 109 | `pkg/cloudprovider/suite_test.go / ProvisionMode = AKSScriptless, ManageExistingAKSMachines=false / It: should list nodeclaim with correct instance type even after capacity error marks offerings unavailable` | Moved; shared | `pkg/cloudprovider/suite_offerings_test.go / runOfferingTests / Create - Unavailable Offerings / It: should list nodeclaim with correct instance type even after capacity error marks offerings unavailable` | Minor: Machine mode coverage added; Scriptless coverage preserved. | Adds AKSMachine-backed NodeClaim listing case. | +| 110 | `pkg/cloudprovider/suite_test.go / ProvisionMode = AKSScriptless, ManageExistingAKSMachines=false / It: should return an ICE error when there are no instance types to launch`; plus AKS Machine counterpart. | Moved; shared | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / Create - CloudProvider Create Error Cases / It: should return an ICE error when there are no instance types to launch` | No change; Scriptless duplicate and Machine API counterpart unified. | - | +| 111 | `pkg/cloudprovider/suite_test.go / ProvisionMode = AKSScriptless, ManageExistingAKSMachines=true / It: should return an ICE error when there are no instance types to launch` | Merged into shared create-error spec | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / Create - CloudProvider Create Error Cases / It: should return an ICE error when there are no instance types to launch` | No change; duplicate inline copy unified. | Duplicate ICE copy covered by shared create-error spec. 
| +| 112 | `pkg/cloudprovider/suite_test.go / ProvisionMode = AKSScriptless, ManageExistingAKSMachines=false / AKS Machine API integration / It: should not call writes to AKS Machine API` | Merged into basic operations | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / It: should be able to handle basic operations` | Minor: direct no-AKS-Machine-writes `It` removed; basic operations covers create-call expectations. | Redundant with basic create-call expectations. | +| 113 | `pkg/cloudprovider/suite_test.go / ProvisionMode = AKSScriptless, ManageExistingAKSMachines=true / AKS Machine API integration / It: should not call writes to AKS Machine API` | Merged into basic operations | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / It: should be able to handle basic operations` | Minor: direct no-AKS-Machine-writes `It` removed; basic operations covers create-call expectations. | Redundant with basic create-call expectations. | +| 114 | `pkg/cloudprovider/suite_test.go / ProvisionMode = AKSScriptless, ManageExistingAKSMachines=false / AKS Machine API integration / AKS Machines Pool Management / It: should handle AKS machines pool not found on each CloudProvider operation` | Merged into basic operations | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / It: should be able to handle basic operations` | Minor: direct pool-management `It` removed; basic operations covers VM list/delete path. | Redundant with basic VM list/delete path. 
| +| 115 | `pkg/cloudprovider/suite_test.go / ProvisionMode = AKSScriptless, ManageExistingAKSMachines=true / AKS Machine API integration / AKS Machines Pool Management / It: should handle AKS machines pool not found on each CloudProvider operation` | Merged into basic operations | `pkg/cloudprovider/suite_integration_test.go / runIntegrationTests / It: should be able to handle basic operations` | Minor: direct pool-management `It` removed; basic operations also checks ManageExisting list call. | Redundant with basic ManageExisting list/delete path. | +| 116 | `pkg/providers/instancetype/suite_test.go / ProvisionMode = AKSScriptless / Labels and Taints / It: entries should cover every WellKnownLabel` | Moved; shared | `pkg/cloudprovider/suite_features_test.go / runFeatureTests / Create - Labels and Taints / It: entries should cover every WellKnownLabel` and `pkg/cloudprovider/suite_features_test.go / ProvisionMode = BootstrappingClient / Create - Bootstrap / It: entries should cover every WellKnownLabel` | No change; matrix assertion moved to both relevant helpers. | - |