Skip to content

Commit

Permalink
chore: Add garbage collection controller for Machines (aws#3405)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Feb 17, 2023
1 parent 2092a18 commit eff97c3
Show file tree
Hide file tree
Showing 12 changed files with 1,000 additions and 33 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${K
--set settings.aws.clusterEndpoint=${CLUSTER_ENDPOINT} \
--set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \
--set settings.aws.interruptionQueueName=${CLUSTER_NAME} \
--set settings.featureGates.driftEnabled=true \
--create-namespace

# CR for local builds of Karpenter
Expand Down Expand Up @@ -48,6 +49,7 @@ run: ## Run Karpenter controller binary against your local cluster
--from-literal=aws.clusterEndpoint=${CLUSTER_ENDPOINT} \
--from-literal=aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \
--from-literal=aws.interruptionQueueName=${CLUSTER_NAME} \
--from-literal=featureGates.driftEnabled=true \
--dry-run=client -o yaml | kubectl apply -f -


Expand Down
2 changes: 1 addition & 1 deletion charts/karpenter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ helm upgrade --install --namespace karpenter --create-namespace \
| controller.sidecarVolumeMounts | list | `[]` | Additional volumeMounts for the sidecar - this will be added to the volume mounts on top of extraVolumeMounts |
| dnsConfig | object | `{}` | Configure DNS Config for the pod |
| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod |
| extraObjects | list | `[]` | Array of extra K8s manifests to deploy |
| extraVolumes | list | `[]` | Additional volumes for the pod. |
| extraObjects | list | `[]` | Array of extra K8s manifests to deploy. |
| fullnameOverride | string | `""` | Overrides the chart's computed fullname. |
| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. |
| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. |
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/Pallinder/go-randomdata v1.2.0
github.com/avast/retry-go v3.0.0+incompatible
github.com/aws/aws-sdk-go v1.44.195
github.com/aws/karpenter-core v0.24.1-0.20230214183652-19517662d6e0
github.com/aws/karpenter-core v0.24.1-0.20230215232304-33f74ed4d625
github.com/go-playground/validator/v10 v10.11.2
github.com/imdario/mergo v0.3.13
github.com/mitchellh/hashstructure/v2 v2.0.2
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/aws/aws-sdk-go v1.44.195 h1:d5xFL0N83Fpsq2LFiHgtBUHknCRUPGHdOlCWt/jtOJs=
github.com/aws/aws-sdk-go v1.44.195/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/karpenter-core v0.24.1-0.20230214183652-19517662d6e0 h1:JufeGqH7NtwU7Bv5+zdebnMHKmtGv/0eDkkJtr5aX+s=
github.com/aws/karpenter-core v0.24.1-0.20230214183652-19517662d6e0/go.mod h1:gCI5P23KEa095lVX70KAJDs5M3fMwEif6rRZXJcm2ME=
github.com/aws/karpenter-core v0.24.1-0.20230215232304-33f74ed4d625 h1:AJz17PfL71101TKV0e6yuqV7lAFXCMH3ap/qQoVbLiI=
github.com/aws/karpenter-core v0.24.1-0.20230215232304-33f74ed4d625/go.mod h1:gCI5P23KEa095lVX70KAJDs5M3fMwEif6rRZXJcm2ME=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
Expand Down
7 changes: 6 additions & 1 deletion pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,12 @@ func (c *CloudProvider) List(ctx context.Context) ([]*v1alpha5.Machine, error) {
}

func (c *CloudProvider) Get(ctx context.Context, providerID string) (*v1alpha5.Machine, error) {
instance, err := c.instanceProvider.Get(ctx, lo.Must(utils.ParseInstanceID(providerID)))
id, err := utils.ParseInstanceID(providerID)
if err != nil {
return nil, fmt.Errorf("getting instance ID, %w", err)
}
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("id", id))
instance, err := c.instanceProvider.Get(ctx, id)
if err != nil {
return nil, fmt.Errorf("getting instance, %w", err)
}
Expand Down
106 changes: 106 additions & 0 deletions pkg/controllers/machine/garbagecollect/controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package garbagecollect

import (
"context"
"fmt"
"time"

"github.com/samber/lo"
"go.uber.org/multierr"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/util/workqueue"
"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
corecloudprovider "github.com/aws/karpenter-core/pkg/cloudprovider"
"github.com/aws/karpenter-core/pkg/operator/controller"
"github.com/aws/karpenter-core/pkg/utils/sets"
"github.com/aws/karpenter/pkg/cloudprovider"
)

type Controller struct {
kubeClient client.Client
cloudProvider *cloudprovider.CloudProvider
}

func NewController(kubeClient client.Client, cloudProvider *cloudprovider.CloudProvider) *Controller {
return &Controller{
kubeClient: kubeClient,
cloudProvider: cloudProvider,
}
}

func (c *Controller) Name() string {
return "machine.garbagecollection"
}

func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
machineList := &v1alpha5.MachineList{}
if err := c.kubeClient.List(ctx, machineList); err != nil {
return reconcile.Result{}, err
}
nodeList := &v1.NodeList{}
if err := c.kubeClient.List(ctx, nodeList); err != nil {
return reconcile.Result{}, err
}
resolvedMachines := lo.Filter(machineList.Items, func(m v1alpha5.Machine, _ int) bool {
return m.Status.ProviderID != ""
})
resolvedProviderIDs := sets.New[string](lo.Map(resolvedMachines, func(m v1alpha5.Machine, _ int) string {
return m.Status.ProviderID
})...)
retrieved, err := c.cloudProvider.List(ctx)
if err != nil {
return reconcile.Result{}, fmt.Errorf("listing cloudprovider machines, %w", err)
}
managedRetrieved := lo.Filter(retrieved, func(m *v1alpha5.Machine, _ int) bool {
return m.Labels[v1alpha5.ManagedByLabelKey] != ""
})
errs := make([]error, len(retrieved))
workqueue.ParallelizeUntil(ctx, 20, len(managedRetrieved), func(i int) {
if !resolvedProviderIDs.Has(managedRetrieved[i].Status.ProviderID) && managedRetrieved[i].CreationTimestamp.Add(time.Minute).Before(time.Now()) {
errs[i] = c.garbageCollect(ctx, managedRetrieved[i], nodeList)
}
})
return reconcile.Result{RequeueAfter: time.Minute * 5}, multierr.Combine(errs...)
}

func (c *Controller) garbageCollect(ctx context.Context, machine *v1alpha5.Machine, nodeList *v1.NodeList) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("provider-id", machine.Status.ProviderID))
if err := c.cloudProvider.Delete(ctx, machine); err != nil {
return corecloudprovider.IgnoreMachineNotFoundError(err)
}
logging.FromContext(ctx).Debugf("garbage collected cloudprovider machine")

// Go ahead and cleanup the node if we know that it exists to make scheduling go quicker
if node, ok := lo.Find(nodeList.Items, func(n v1.Node) bool {
return n.Spec.ProviderID == machine.Status.ProviderID
}); ok {
if err := c.kubeClient.Delete(ctx, &node); err != nil {
return client.IgnoreNotFound(err)
}
logging.FromContext(ctx).With("node", node.Name).Debugf("garbage collected node")
}
return nil
}

func (c *Controller) Builder(_ context.Context, m manager.Manager) controller.Builder {
return controller.NewSingletonManagedBy(m)
}
Loading

0 comments on commit eff97c3

Please sign in to comment.