Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 1bb1352

Browse files
authoredMay 23, 2024
Merge pull request #5 from smartnews/fix-extra-node
chore: ignore handled pending pods
2 parents a172372 + 5297908 commit 1bb1352

File tree

3 files changed

+59
-4
lines changed

3 files changed

+59
-4
lines changed
 

‎pkg/controllers/provisioning/provisioner.go

+36-4
Original file line numberDiff line numberDiff line change
@@ -310,20 +310,27 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
310310
return scheduler.Results{}, err
311311
}
312312
pods := append(pendingPods, deletingNodePods...)
313+
// filter pods which are alredy handled in last 3 minute
314+
targetPods := lo.FilterMap(pods, func(pod *v1.Pod, _ int) (*v1.Pod, bool) {
315+
if p.isPodHandled(ctx, pod) {
316+
return nil, false
317+
}
318+
return pod, true
319+
})
313320
// nothing to schedule, so just return success
314-
if len(pods) == 0 {
321+
if len(targetPods) == 0 {
315322
return scheduler.Results{}, nil
316323
}
317-
s, err := p.NewScheduler(ctx, pods, nodes.Active())
324+
s, err := p.NewScheduler(ctx, targetPods, nodes.Active())
318325
if err != nil {
319326
if errors.Is(err, ErrNodePoolsNotFound) {
320327
logging.FromContext(ctx).Info(ErrNodePoolsNotFound)
321328
return scheduler.Results{}, nil
322329
}
323330
return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err)
324331
}
325-
results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
326-
logging.FromContext(ctx).With("pods", pretty.Slice(lo.Map(pods, func(p *v1.Pod, _ int) string { return client.ObjectKeyFromObject(p).String() }), 5)).
332+
results := s.Solve(ctx, targetPods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
333+
logging.FromContext(ctx).With("pods", pretty.Slice(lo.Map(targetPods, func(p *v1.Pod, _ int) string { return client.ObjectKeyFromObject(p).String() }), 5)).
327334
With("duration", time.Since(start)).
328335
Infof("found provisionable pod(s)")
329336
results.Record(ctx, p.recorder, p.cluster)
@@ -419,6 +426,31 @@ func (p *Provisioner) Validate(ctx context.Context, pod *v1.Pod) error {
419426
)
420427
}
421428

429+
func (p *Provisioner) isPodHandled(ctx context.Context, pod *v1.Pod) bool {
430+
var events v1.EventList
431+
filter := client.MatchingFields{
432+
"namespace": pod.Namespace,
433+
"involvedObject.kind": "Pod",
434+
"involvedObject.name": pod.Name,
435+
"reason": "HandledByKarpenter",
436+
}
437+
logging.FromContext(ctx).Debugf("get event for %s/%s", pod.Namespace, pod.Name)
438+
if err := p.kubeClient.List(ctx, &events, filter); err == nil {
439+
for _, event := range events.Items {
440+
logging.FromContext(ctx).Debugf("found event %s/%s", pod.Namespace, event.Name)
441+
// ignore the pod if it's already handled in 3 minute
442+
if time.Now().Before(event.LastTimestamp.Time.Add(3 * time.Minute)) {
443+
logging.FromContext(ctx).Infof("pod %s/%s is handled", pod.Namespace, pod.Name)
444+
return true
445+
}
446+
}
447+
} else {
448+
logging.FromContext(ctx).Errorf("failed to get event for %s/%s: %w", pod.Namespace, pod.Name, err)
449+
}
450+
p.recorder.Publish(scheduler.PodHandledEvent(pod))
451+
return false
452+
}
453+
422454
// validateKarpenterManagedLabelCanExist provides a more clear error message in the event of scheduling a pod that specifically doesn't
423455
// want to run on a Karpenter node (e.g. a Karpenter controller replica).
424456
func validateKarpenterManagedLabelCanExist(p *v1.Pod) error {

‎pkg/controllers/provisioning/scheduling/events.go

+11
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,14 @@ func PodFailedToScheduleEvent(pod *v1.Pod, err error) events.Event {
5959
DedupeTimeout: 5 * time.Minute,
6060
}
6161
}
62+
63+
func PodHandledEvent(pod *v1.Pod) events.Event {
64+
return events.Event{
65+
InvolvedObject: pod,
66+
Type: v1.EventTypeNormal,
67+
Reason: "HandledByKarpenter",
68+
Message: "Pod is handled by karpenter",
69+
DedupeValues: []string{string(pod.UID)},
70+
DedupeTimeout: 5 * time.Minute,
71+
}
72+
}

‎pkg/operator/operator.go

+12
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,18 @@ func NewOperator() (context.Context, *Operator) {
187187
lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.Node{}, "spec.providerID", func(o client.Object) []string {
188188
return []string{o.(*v1.Node).Spec.ProviderID}
189189
}), "failed to setup node provider id indexer")
190+
lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.Event{}, "involvedObject.kind", func(o client.Object) []string {
191+
return []string{o.(*v1.Event).InvolvedObject.Kind}
192+
}), "failed to setup event kind indexer")
193+
lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.Event{}, "involvedObject.name", func(o client.Object) []string {
194+
return []string{o.(*v1.Event).InvolvedObject.Name}
195+
}), "failed to setup event name indexer")
196+
lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.Event{}, "namespace", func(o client.Object) []string {
197+
return []string{o.(*v1.Event).Namespace}
198+
}), "failed to setup event namespace indexer")
199+
lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.Event{}, "reason", func(o client.Object) []string {
200+
return []string{o.(*v1.Event).Reason}
201+
}), "failed to setup event reason indexer")
190202
lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1beta1.NodeClaim{}, "status.providerID", func(o client.Object) []string {
191203
return []string{o.(*v1beta1.NodeClaim).Status.ProviderID}
192204
}), "failed to setup nodeclaim provider id indexer")

0 commit comments

Comments
 (0)
Please sign in to comment.