@@ -310,20 +310,27 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
		return scheduler.Results{}, err
	}
	pods := append(pendingPods, deletingNodePods...)
+	// filter out pods that were already handled within the last 3 minutes
+	targetPods := lo.FilterMap(pods, func(pod *v1.Pod, _ int) (*v1.Pod, bool) {
+		if p.isPodHandled(ctx, pod) {
+			return nil, false
+		}
+		return pod, true
+	})
	// nothing to schedule, so just return success
-	if len(pods) == 0 {
+	if len(targetPods) == 0 {
		return scheduler.Results{}, nil
	}
-	s, err := p.NewScheduler(ctx, pods, nodes.Active())
+	s, err := p.NewScheduler(ctx, targetPods, nodes.Active())
	if err != nil {
		if errors.Is(err, ErrNodePoolsNotFound) {
			logging.FromContext(ctx).Info(ErrNodePoolsNotFound)
			return scheduler.Results{}, nil
		}
		return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err)
	}
-	results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
-	logging.FromContext(ctx).With("pods", pretty.Slice(lo.Map(pods, func(p *v1.Pod, _ int) string { return client.ObjectKeyFromObject(p).String() }), 5)).
+	results := s.Solve(ctx, targetPods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
+	logging.FromContext(ctx).With("pods", pretty.Slice(lo.Map(targetPods, func(p *v1.Pod, _ int) string { return client.ObjectKeyFromObject(p).String() }), 5)).
		With("duration", time.Since(start)).
		Infof("found provisionable pod(s)")
	results.Record(ctx, p.recorder, p.cluster)
@@ -419,6 +426,31 @@ func (p *Provisioner) Validate(ctx context.Context, pod *v1.Pod) error {
	)
}

+func (p *Provisioner) isPodHandled(ctx context.Context, pod *v1.Pod) bool {
+	var events v1.EventList
+	filter := client.MatchingFields{
+		"namespace":           pod.Namespace,
+		"involvedObject.kind": "Pod",
+		"involvedObject.name": pod.Name,
+		"reason":              "HandledByKarpenter",
+	}
+	logging.FromContext(ctx).Debugf("getting events for %s/%s", pod.Namespace, pod.Name)
+	if err := p.kubeClient.List(ctx, &events, filter); err == nil {
+		for _, event := range events.Items {
+			logging.FromContext(ctx).Debugf("found event %s/%s", pod.Namespace, event.Name)
+			// ignore the pod if it was already handled within the last 3 minutes
+			if time.Now().Before(event.LastTimestamp.Time.Add(3 * time.Minute)) {
+				logging.FromContext(ctx).Infof("pod %s/%s was already handled", pod.Namespace, pod.Name)
+				return true
+			}
+		}
+	} else {
+		logging.FromContext(ctx).Errorf("failed to list events for %s/%s: %v", pod.Namespace, pod.Name, err)
+	}
+	p.recorder.Publish(scheduler.PodHandledEvent(pod))
+	return false
+}
+
// validateKarpenterManagedLabelCanExist provides a more clear error message in the event of scheduling a pod that specifically doesn't
// want to run on a Karpenter node (e.g. a Karpenter controller replica).
func validateKarpenterManagedLabelCanExist(p *v1.Pod) error {
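One thing the diff does not show is how the client.MatchingFields selector in isPodHandled is served. With controller-runtime's cached client, each field used in a MatchingFields list normally has to be registered as an index at manager setup, or the List call fails; against the live API server, events already support these field selectors. The sketch below is illustrative only and not part of this PR; the helper name and wiring are assumptions.

// Illustrative sketch, not part of this PR: register the event field indexes
// that isPodHandled relies on. Helper name and call site are assumptions.
// Imports assumed: "context", v1 "k8s.io/api/core/v1",
// "sigs.k8s.io/controller-runtime/pkg/client", "sigs.k8s.io/controller-runtime/pkg/manager".
func registerEventIndexes(ctx context.Context, mgr manager.Manager) error {
	if err := mgr.GetFieldIndexer().IndexField(ctx, &v1.Event{}, "involvedObject.name", func(o client.Object) []string {
		return []string{o.(*v1.Event).InvolvedObject.Name}
	}); err != nil {
		return err
	}
	if err := mgr.GetFieldIndexer().IndexField(ctx, &v1.Event{}, "involvedObject.kind", func(o client.Object) []string {
		return []string{o.(*v1.Event).InvolvedObject.Kind}
	}); err != nil {
		return err
	}
	// "reason" and "namespace" would be registered the same way.
	return mgr.GetFieldIndexer().IndexField(ctx, &v1.Event{}, "reason", func(o client.Object) []string {
		return []string{o.(*v1.Event).Reason}
	})
}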
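scheduler.PodHandledEvent is referenced above but defined elsewhere in the PR, so it is not visible in this excerpt. As a rough sketch of what such a helper could look like, assuming Karpenter's events.Event type (the import path, field names, and message are assumptions and may differ by version):

// Hypothetical sketch only; the actual PodHandledEvent lives in the scheduler's
// events file and may differ. The Reason must match the value filtered on in isPodHandled.
func PodHandledEvent(pod *v1.Pod) events.Event {
	return events.Event{
		InvolvedObject: pod,
		Type:           v1.EventTypeNormal,
		Reason:         "HandledByKarpenter",
		Message:        "Pod has been handled by the provisioner",
		DedupeValues:   []string{string(pod.UID)},
	}
}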