Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion .github/actions/e2e/create-cluster/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ inputs:
description: "the azure vm size to use for the e2e test (set to empty string to allow AKS to default)"
required: false
default: ""
windows:
description: "When 'true', create a dedicated Windows-capable cluster (Azure CNI overlay + windowsProfile) instead of the default Cilium cluster"
required: false
default: "false"
aks_machines_pool_name:
description: "Name of the AKS 'machines' mode agent pool to create (machine API modes only). Windows requires a name <= 6 chars."
required: false
default: "testmpool"
runs:
using: "composite"
steps:
Expand All @@ -50,8 +58,19 @@ runs:
AZURE_VM_SIZE: ${{ inputs.azure_vm_size }}
K8S_VERSION: ${{ inputs.k8s_version }}
PROVISION_MODE: ${{ inputs.provision_mode }}
AKS_MACHINES_POOL_NAME: ${{ inputs.aks_machines_pool_name }}
run: |
if [ "${{ inputs.identity_type }}" = "UserAssigned" ]; then
if [ "${{ inputs.windows }}" = "true" ]; then
echo "Creating dedicated Windows-capable cluster (Azure CNI overlay + windowsProfile)"
# Generate a throwaway Windows admin password for this ephemeral cluster (meets Windows
# complexity: upper, lower, digit, special). Mask it so it never appears in logs. The RP
# sources Windows node admin credentials from the cluster's windowsProfile; the tests
# never use it and the cluster is deleted at the end of the run.
WINDOWS_ADMIN_PASSWORD="Aks$(openssl rand -base64 24 | tr -dc 'A-Za-z0-9' | head -c 24)9!"
echo "::add-mask::$WINDOWS_ADMIN_PASSWORD"
export WINDOWS_ADMIN_PASSWORD
make ci-mkcluster-all-windows
elif [ "${{ inputs.identity_type }}" = "UserAssigned" ]; then
echo "Creating cluster with user-assigned managed identity"
make ci-mkcluster-all-userassigned
else
Expand Down
5 changes: 5 additions & 0 deletions .github/actions/e2e/install-karpenter/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ inputs:
provision_mode:
description: "the Karpenter provisioning mode to run the e2e test in"
default: "aksscriptless"
aks_machines_pool_name:
description: "Name of the AKS 'machines' mode agent pool (machine API modes only). Must match the pool created at cluster-create time."
required: false
default: "testmpool"
runs:
using: "composite"
steps:
Expand All @@ -35,6 +39,7 @@ runs:
AZURE_CLUSTER_NAME: ${{ inputs.cluster_name }}
AZURE_LOCATION: ${{ inputs.location }}
PROVISION_MODE: ${{ inputs.provision_mode }}
AKS_MACHINES_POOL_NAME: ${{ inputs.aks_machines_pool_name }}
AZURE_ACR_NAME: ${{ inputs.acr_name }}
# Redirect Go temp/cache to /mnt/ (Azure temporary disk) which has more disk
# space than the OS disk. 1ES pool VMs can run out of disk on / during large builds.
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@
1. Create your new E2E testing suite `<test-suite-name>` within the `test/suites/` package. See: `test/README.md`
2. Update the `workflows/e2e-matrix.yaml` workflow to include your E2E test case: `suite: [Utilization, GPU, ...]` - add in the name of your folder within the `test/suites/` package to the comma separated list. Casing does not matter.

> **Note — suites that need a non-default cluster:** most suites run on the shared CI cluster
> (`ci-mkcluster-all`, Azure CNI overlay + Cilium, in the matrix's `provision_mode`). The `Windows`
> suite is special-cased in `workflows/e2e.yaml`: it always runs in `aksmachineapi` mode (Windows is
> only provisionable via the AKS Machine API) on a dedicated cluster (created by
> `ci-mkcluster-all-windows`, which depends on `az-mkaks-windows`) because Windows does not support
> the Cilium dataplane, and it uses a machines pool name of at most 6 characters (`winmp`) to satisfy
> the Windows machine-name limit. Follow that pattern if a new suite needs its own cluster shape or
> provisioning mode.

### Running the test case

(temporary workflow until we re-enable automation)
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/e2e-matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ jobs:
- Storage
- Subnet
- Utilization
- Windows
permissions:
contents: read
statuses: write
Expand Down
14 changes: 11 additions & 3 deletions .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ on:
- Storage
- Subnet
- Utilization
- Windows
location:
type: choice
description: "Azure location to run the e2e test in"
Expand Down Expand Up @@ -216,9 +217,14 @@ jobs:
git_ref: ${{ inputs.git_ref }}
location: ${{ inputs.location }}
k8s_version: ${{ inputs.k8s_version }}
provision_mode: ${{ inputs.provision_mode }}
# Windows is only provisionable via the AKS Machine API, so force that mode for the Windows
# suite regardless of the matrix's provision_mode (it would otherwise skip under aksscriptless).
provision_mode: ${{ inputs.suite == 'Windows' && 'aksmachineapi' || inputs.provision_mode }}
identity_type: ${{ inputs.suite == 'Machine' && 'UserAssigned' || 'SystemAssigned' }}
azure_vm_size: ${{ inputs.azure_vm_size }}
# Windows needs a dedicated (non-Cilium) cluster and a machines pool name <= 6 chars.
windows: ${{ inputs.suite == 'Windows' }}
aks_machines_pool_name: ${{ inputs.suite == 'Windows' && 'winmp' || 'testmpool' }}
- name: install karpenter
uses: ./.github/actions/e2e/install-karpenter
with:
Expand All @@ -228,7 +234,8 @@ jobs:
acr_name: ${{ env.ACR_NAME }}
git_ref: ${{ inputs.git_ref }}
location: ${{ inputs.location }}
provision_mode: ${{ inputs.provision_mode }}
provision_mode: ${{ inputs.suite == 'Windows' && 'aksmachineapi' || inputs.provision_mode }}
aks_machines_pool_name: ${{ inputs.suite == 'Windows' && 'winmp' || 'testmpool' }}
- name: run the ${{ inputs.suite }} test suite
env:
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
Expand All @@ -239,7 +246,8 @@ jobs:
AZURE_CLIENT_ID: ${{ secrets.E2E_CLIENT_ID_TEST }}
TEST_SUITE: ${{ inputs.suite }}
GIT_REF: ${{ github.sha }}
PROVISION_MODE: ${{ inputs.provision_mode }}
PROVISION_MODE: ${{ inputs.suite == 'Windows' && 'aksmachineapi' || inputs.provision_mode }}
AKS_MACHINES_POOL_NAME: ${{ inputs.suite == 'Windows' && 'winmp' || 'testmpool' }}
run: |
make az-creds
make e2etests
Expand Down
32 changes: 32 additions & 0 deletions Makefile-az.mk
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ KO_BASE_IMAGE_AMD64 ?= mcr.microsoft.com/azurelinux/distroless/base@sha256:301f0
KO_BASE_IMAGE_ARM64 ?= mcr.microsoft.com/azurelinux/distroless/base@sha256:ef54cbe5a632f71090688f45901d073f19f414eb38516a60891ce3dff33c2029
export KOCACHE ?= $(or $(RUNNER_TEMP),/tmp)/ko-cache

# Windows admin credentials, used when creating a Windows-capable test cluster
# (az-mkaks-windows). The AKS RP sources Windows node admin credentials from the cluster's
# windowsProfile, so the cluster must be created with these. Override for non-throwaway use.
# Both are assigned with ?=, so a value already present in the environment or given on the
# make command line takes precedence over the defaults below.
# NOTE(review): the password default is a placeholder committed to the repository, so treat it
# as publicly known; only rely on it for clusters that are deleted right after the run.
WINDOWS_ADMIN_USERNAME ?= azureuser
WINDOWS_ADMIN_PASSWORD ?= Repl@ceMe-W1ndows-E2E!

.DEFAULT_GOAL := help # make without arguments will show help

export KO_GO_PATH ?= hack/go-crossbuild.sh
Expand Down Expand Up @@ -120,6 +126,13 @@ ci-mkcluster-all: az-create-workload-msi az-mkaks-cilium

ci-mkcluster-all-userassigned: az-create-workload-msi az-mkaks-cilium-userassigned az-create-federated-cred $(AZ_ALL_PERMS)

# Windows e2e needs a dedicated cluster: Windows does not support the Cilium dataplane used by the
# default CI cluster, so this uses az-mkaks-windows (Azure CNI overlay + windowsProfile). The Windows
# suite only provisions via the AKS Machine API, so invoke with PROVISION_MODE=aksmachineapi (which
# pulls the machine perms + machines pool into AZ_ALL_PERMS) and AKS_MACHINES_POOL_NAME set to a name
# <= 6 chars (Windows machine-name budget for a custom pool).
# Example: make ci-mkcluster-all-windows PROVISION_MODE=aksmachineapi AKS_MACHINES_POOL_NAME=winmp
# az-mkaks-windows also reads WINDOWS_ADMIN_USERNAME / WINDOWS_ADMIN_PASSWORD; set the password
# explicitly for anything other than a throwaway cluster.
ci-mkcluster-all-windows: az-create-workload-msi az-mkaks-windows az-create-federated-cred $(AZ_ALL_PERMS)

ci-install: az-configure-values az-build az-run

# ---------------------------------------------
Expand Down Expand Up @@ -233,6 +246,24 @@ az-mkaks-overlay: az-mkacr ## Create test AKS cluster (with --network-plugin-mod
$(MAKE) az-creds
skaffold config set default-repo $(AZURE_ACR_NAME).$(AZURE_ACR_SUFFIX)/karpenter

# az-mkaks-windows creates the cluster used by the Windows e2e suite, then refreshes
# credentials and points skaffold at the ACR.
#
# Creation is gated on the exit code of hack/deploy/check-cluster-exists.sh:
#   1     -> run `az aks create`
#   2     -> abort the recipe (exit 1)
#   other -> skip creation and fall through to the follow-up steps
#           (presumably 0 means a matching cluster already exists; confirm in the script)
#
# Reads: AZURE_CLUSTER_NAME, AZURE_RESOURCE_GROUP, AZURE_ACR_NAME, AZURE_ACR_SUFFIX,
#        WINDOWS_ADMIN_USERNAME, WINDOWS_ADMIN_PASSWORD, and optionally AZURE_VM_SIZE and
#        K8S_VERSION (each flag is omitted when its variable is empty).
#
# The password is passed inside shell single quotes. Any single quote embedded in the value is
# rewritten to '\'' by $(subst ...) so an overridden password cannot terminate the quoting and
# break (or inject into) the az command line.
az-mkaks-windows: az-mkacr ## Create a Windows-capable test AKS cluster (Azure CNI overlay + windowsProfile) for the Windows e2e suite
	@hack/deploy/check-cluster-exists.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) az-mkaks-windows; \
	EXIT_CODE=$$?; \
	if [ $$EXIT_CODE -eq 1 ]; then \
		az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --attach-acr $(AZURE_ACR_NAME) \
			--enable-managed-identity --node-count 3 --generate-ssh-keys \
			--network-plugin azure --network-plugin-mode overlay \
			--windows-admin-username $(WINDOWS_ADMIN_USERNAME) --windows-admin-password '$(subst ','\'',$(WINDOWS_ADMIN_PASSWORD))' \
			--enable-oidc-issuer --enable-workload-identity --nodepool-taints "CriticalAddonsOnly=true:NoSchedule" \
			$(if $(AZURE_VM_SIZE),--node-vm-size $(AZURE_VM_SIZE)) \
			$(if $(K8S_VERSION),--kubernetes-version $(K8S_VERSION)) \
			--tags "make-command=az-mkaks-windows"; \
	elif [ $$EXIT_CODE -eq 2 ]; then \
		exit 1; \
	fi
	$(MAKE) az-creds
	skaffold config set default-repo $(AZURE_ACR_NAME).$(AZURE_ACR_SUFFIX)/karpenter

az-mkaks-perftest: az-mkacr ## Create test AKS cluster (with Azure Overlay, larger system pool VMs and larger pod-cidr)
@hack/deploy/check-cluster-exists.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) az-mkaks-perftest; \
EXIT_CODE=$$?; \
Expand Down Expand Up @@ -318,6 +349,7 @@ az-perm-sig: ## Create role assignments when testing with SIG images
$(eval KARPENTER_USER_ASSIGNED_CLIENT_ID=$(shell az identity show --resource-group "${AZURE_RESOURCE_GROUP}" --name "${AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME}" --query 'principalId' --output tsv))
az role assignment create --assignee-object-id $(KARPENTER_USER_ASSIGNED_CLIENT_ID) --assignee-principal-type "ServicePrincipal" --role "Reader" --scope /subscriptions/$(AZURE_SIG_SUBSCRIPTION_ID)/resourceGroups/AKS-Ubuntu/providers/Microsoft.Compute/galleries/AKSUbuntu
az role assignment create --assignee-object-id $(KARPENTER_USER_ASSIGNED_CLIENT_ID) --assignee-principal-type "ServicePrincipal" --role "Reader" --scope /subscriptions/$(AZURE_SIG_SUBSCRIPTION_ID)/resourceGroups/AKS-AzureLinux/providers/Microsoft.Compute/galleries/AKSAzureLinux
az role assignment create --assignee-object-id $(KARPENTER_USER_ASSIGNED_CLIENT_ID) --assignee-principal-type "ServicePrincipal" --role "Reader" --scope /subscriptions/$(AZURE_SIG_SUBSCRIPTION_ID)/resourceGroups/AKS-Windows/providers/Microsoft.Compute/galleries/AKSWindows

az-perm-subnet-custom: az-perm ## Create role assignments to let Karpenter manage VMs and Network (custom VNet)
$(eval VNET_SUBNET_ID=$(shell az aks show --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --query "agentPoolProfiles[0].vnetSubnetId" --output tsv))
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Karpenter provider for AKS can be used in two modes:
## Known limitations

The following AKS features are not supported:
* Windows nodes.
* Windows nodes in self-hosted mode. Windows node provisioning is supported only with the AKS Machine API provision mode (used by Node Auto Provisioning); the self-hosted (scriptless) provision mode does not support Windows.
* Kubenet and Calico.
* IPv6 clusters.
* [Service Principal](https://learn.microsoft.com/azure/aks/kubernetes-service-principal) based clusters. A system-assigned or user-assigned managed identity must be used.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1005,6 +1005,8 @@ spec:
- Ubuntu2204
- Ubuntu2404
- AzureLinux
- Windows2022
- Windows2025
type: string
kubelet:
description: |-
Expand Down Expand Up @@ -1680,6 +1682,12 @@ spec:
rule: 'has(self.fipsMode) && self.fipsMode == ''FIPS'' ? (has(self.imageFamily)
&& self.imageFamily != ''Ubuntu2204'' && self.imageFamily != ''Ubuntu2404'')
: true'
- message: FIPS is not supported for Windows image families
rule: '!has(self.fipsMode) || self.fipsMode != ''FIPS'' || !has(self.imageFamily)
|| !(self.imageFamily in [''Windows2022'',''Windows2025''])'
- message: linuxOSConfig is not supported for Windows image families
rule: '!has(self.linuxOSConfig) || !has(self.imageFamily) || !(self.imageFamily
in [''Windows2022'',''Windows2025''])'
- message: kubelet.failSwapOn must be set to false when linuxOSConfig.swapFileSize
is specified
rule: '!has(self.linuxOSConfig) || !has(self.linuxOSConfig.swapFileSize)
Expand Down
9 changes: 6 additions & 3 deletions charts/karpenter/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,13 @@ spec:
initContainers:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
{{- /* The Karpenter controller is a Linux-only component. Always pin it to Linux
nodes so it is never scheduled onto Windows nodes in hybrid clusters, while
still honoring any additional user-provided nodeSelector entries. */}}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- $nodeSelector := merge (dict) (.Values.nodeSelector | default dict) }}
{{- $_ := set $nodeSelector "kubernetes.io/os" "linux" }}
{{- toYaml $nodeSelector | nindent 8 }}
{{- with .Values.affinity }}
# The template below patches the .Values.affinity to add a default label selector where not specified
{{- $_ := include "karpenter.patchAffinity" $ }}
Expand Down
4 changes: 3 additions & 1 deletion charts/karpenter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ dnsConfig: {}
# options:
# - name: ndots
# value: "1"
# -- Node selectors to schedule the pod to nodes with labels.
# -- Node selectors to schedule the pod to nodes with labels. Note: the controller is a
# Linux-only component, so `kubernetes.io/os: linux` is always enforced by the chart (even
# if overridden here) to keep it off Windows nodes in hybrid clusters.
nodeSelector:
kubernetes.io/os: linux
# -- Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels.
Expand Down
8 changes: 8 additions & 0 deletions pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1005,6 +1005,8 @@ spec:
- Ubuntu2204
- Ubuntu2404
- AzureLinux
- Windows2022
- Windows2025
type: string
kubelet:
description: |-
Expand Down Expand Up @@ -1680,6 +1682,12 @@ spec:
rule: 'has(self.fipsMode) && self.fipsMode == ''FIPS'' ? (has(self.imageFamily)
&& self.imageFamily != ''Ubuntu2204'' && self.imageFamily != ''Ubuntu2404'')
: true'
- message: FIPS is not supported for Windows image families
rule: '!has(self.fipsMode) || self.fipsMode != ''FIPS'' || !has(self.imageFamily)
|| !(self.imageFamily in [''Windows2022'',''Windows2025''])'
- message: linuxOSConfig is not supported for Windows image families
rule: '!has(self.linuxOSConfig) || !has(self.imageFamily) || !(self.imageFamily
in [''Windows2022'',''Windows2025''])'
- message: kubelet.failSwapOn must be set to false when linuxOSConfig.swapFileSize
is specified
rule: '!has(self.linuxOSConfig) || !has(self.linuxOSConfig.swapFileSize)
Expand Down
4 changes: 3 additions & 1 deletion pkg/apis/v1beta1/aksnodeclass.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ func (a *ArtifactStreaming) IsEnabled(arch string) bool {
// AKSNodeClassSpec is the top level specification for the AKS Karpenter Provider.
// This will contain configuration necessary to launch instances in AKS.
// +kubebuilder:validation:XValidation:message="FIPS is not yet supported for Ubuntu2204 or Ubuntu2404",rule="has(self.fipsMode) && self.fipsMode == 'FIPS' ? (has(self.imageFamily) && self.imageFamily != 'Ubuntu2204' && self.imageFamily != 'Ubuntu2404') : true"
// +kubebuilder:validation:XValidation:message="FIPS is not supported for Windows image families",rule="!has(self.fipsMode) || self.fipsMode != 'FIPS' || !has(self.imageFamily) || !(self.imageFamily in ['Windows2022','Windows2025'])"
// +kubebuilder:validation:XValidation:message="linuxOSConfig is not supported for Windows image families",rule="!has(self.linuxOSConfig) || !has(self.imageFamily) || !(self.imageFamily in ['Windows2022','Windows2025'])"
// +kubebuilder:validation:XValidation:message="kubelet.failSwapOn must be set to false when linuxOSConfig.swapFileSize is specified",rule="!has(self.linuxOSConfig) || !has(self.linuxOSConfig.swapFileSize) || (has(self.kubelet) && has(self.kubelet.failSwapOn) && self.kubelet.failSwapOn == false)"
type AKSNodeClassSpec struct {
// vnetSubnetID is the subnet used by nics provisioned with this nodeclass.
Expand All @@ -79,7 +81,7 @@ type AKSNodeClassSpec struct {
ImageID *string `json:"-"`
// imageFamily is the image family that instances use.
// +default="Ubuntu"
// +kubebuilder:validation:Enum:={Ubuntu,Ubuntu2204,Ubuntu2404,AzureLinux}
// +kubebuilder:validation:Enum:={Ubuntu,Ubuntu2204,Ubuntu2404,AzureLinux,Windows2022,Windows2025}
// +optional
ImageFamily *string `json:"imageFamily,omitempty"`
// fipsMode controls FIPS compliance for the provisioned nodes
Expand Down
Loading