|
| 1 | +package component |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "fmt" |
| 6 | + "math" |
| 7 | + "time" |
| 8 | + |
| 9 | + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" |
| 10 | + "github.com/NexusGPU/tensor-fusion/internal/constants" |
| 11 | + ctrl "sigs.k8s.io/controller-runtime" |
| 12 | + "sigs.k8s.io/controller-runtime/pkg/client" |
| 13 | + "sigs.k8s.io/controller-runtime/pkg/log" |
| 14 | +) |
| 15 | + |
// Interface is the set of hooks ManageUpdate needs from a pool component in
// order to drive its batched update. The implementations are not part of this
// file, so the notes below describe only how ManageUpdate uses each method.
type Interface interface {
	// GetName returns the component name; ManageUpdate only uses it as a log field.
	GetName() string
	// DetectConfigChange reports, in order: whether the configuration changed,
	// the current configuration hash, and the previously recorded hash.
	DetectConfigChange(pool *tfv1.GPUPool, status *tfv1.PoolComponentStatus) (bool, string, string)
	// SetConfigHash records hash in status; called when a change is detected.
	SetConfigHash(status *tfv1.PoolComponentStatus, hash string)
	// GetUpdateInProgressInfo returns the hash of the update currently being
	// rolled out; ManageUpdate compares it with the current configuration hash.
	// NOTE(review): presumably stored in pool annotations — confirm in implementations.
	GetUpdateInProgressInfo(pool *tfv1.GPUPool) string
	// SetUpdateInProgressInfo stores the hash being rolled out; ManageUpdate
	// passes "" once every batch has completed.
	SetUpdateInProgressInfo(pool *tfv1.GPUPool, hash string)
	// GetBatchUpdateLastTimeInfo returns the completion time of the last batch.
	// ManageUpdate parses a non-empty value as RFC 3339 and treats "" as "none".
	GetBatchUpdateLastTimeInfo(pool *tfv1.GPUPool) string
	// SetBatchUpdateLastTimeInfo stores the completion time of the last batch as
	// an RFC 3339 string; ManageUpdate passes "" to reset it.
	SetBatchUpdateLastTimeInfo(pool *tfv1.GPUPool, time string)
	// GetUpdateProgress returns the recorded progress. ManageUpdate treats it as
	// a percentage: 0 on a new configuration, 100 when the rollout is finished.
	GetUpdateProgress(status *tfv1.PoolComponentStatus) int32
	// SetUpdateProgress records the progress percentage in status.
	SetUpdateProgress(status *tfv1.PoolComponentStatus, progress int32)
	// GetResourcesInfo returns, in order: the total number of resources, the
	// number already updated to hash, whether the caller should requeue and
	// check again later, and an error.
	GetResourcesInfo(r client.Client, ctx context.Context, pool *tfv1.GPUPool, hash string) (int, int, bool, error)
	// PerformBatchUpdate is asked to update delta more resources. It returns
	// whether the caller should requeue and check again later, and an error.
	PerformBatchUpdate(r client.Client, ctx context.Context, pool *tfv1.GPUPool, delta int) (bool, error)
}
| 29 | + |
| 30 | +func ManageUpdate(r client.Client, ctx context.Context, pool *tfv1.GPUPool, component Interface) (*ctrl.Result, error) { |
| 31 | + log := log.FromContext(ctx) |
| 32 | + |
| 33 | + autoUpdate, batchInterval := getUpdatePolicy(pool) |
| 34 | + newStatus := pool.Status.ComponentStatus.DeepCopy() |
| 35 | + |
| 36 | + changed, configHash, oldHash := component.DetectConfigChange(pool, newStatus) |
| 37 | + if changed { |
| 38 | + log.Info("component configuration changed", "component", component.GetName(), "old hash", oldHash, "new hash", configHash) |
| 39 | + component.SetConfigHash(newStatus, configHash) |
| 40 | + component.SetUpdateProgress(newStatus, 0) |
| 41 | + if oldHash == "" || autoUpdate == false { |
| 42 | + return nil, patchComponentStatus(r, ctx, pool, newStatus) |
| 43 | + } |
| 44 | + if pool.Annotations == nil { |
| 45 | + pool.Annotations = map[string]string{} |
| 46 | + } |
| 47 | + patch := client.MergeFrom(pool.DeepCopy()) |
| 48 | + component.SetUpdateInProgressInfo(pool, configHash) |
| 49 | + component.SetBatchUpdateLastTimeInfo(pool, "") |
| 50 | + if err := r.Patch(ctx, pool, patch); err != nil { |
| 51 | + return nil, fmt.Errorf("failed to patch pool: %w", err) |
| 52 | + } |
| 53 | + } else { |
| 54 | + if autoUpdate == false || component.GetUpdateInProgressInfo(pool) != configHash { |
| 55 | + return nil, nil |
| 56 | + } |
| 57 | + if timeInfo := component.GetBatchUpdateLastTimeInfo(pool); len(timeInfo) != 0 { |
| 58 | + lastBatchUpdateTime, err := time.Parse(time.RFC3339, timeInfo) |
| 59 | + if err != nil { |
| 60 | + return nil, err |
| 61 | + } |
| 62 | + nextBatchUpdateTime := lastBatchUpdateTime.Add(batchInterval) |
| 63 | + if now := time.Now(); now.Before(nextBatchUpdateTime) { |
| 64 | + log.Info("next batch update time not yet reached", "now", now, "nextBatchUpdateTime", nextBatchUpdateTime) |
| 65 | + return &ctrl.Result{RequeueAfter: nextBatchUpdateTime.Sub(now)}, nil |
| 66 | + } |
| 67 | + log.Info("next batch update time reached", "BatchUpdateTime", nextBatchUpdateTime) |
| 68 | + } |
| 69 | + } |
| 70 | + |
| 71 | + totalSize, updatedSize, recheck, err := component.GetResourcesInfo(r, ctx, pool, configHash) |
| 72 | + if err != nil { |
| 73 | + return nil, err |
| 74 | + } else if recheck { |
| 75 | + return &ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, err |
| 76 | + } else if totalSize <= 0 { |
| 77 | + return nil, nil |
| 78 | + } |
| 79 | + |
| 80 | + batchPercentage := pool.Spec.NodeManagerConfig.NodePoolRollingUpdatePolicy.BatchPercentage |
| 81 | + updateProgress := component.GetUpdateProgress(newStatus) |
| 82 | + delta, newUpdateProgress, currentBatchIndex := calculateDesiredUpdatedDelta(totalSize, updatedSize, batchPercentage, updateProgress) |
| 83 | + component.SetUpdateProgress(newStatus, newUpdateProgress) |
| 84 | + log.Info("update in progress", "component", component.GetName(), "hash", configHash, |
| 85 | + "updateProgress", newUpdateProgress, "totalSize", totalSize, "updatedSize", updatedSize, |
| 86 | + "batchPercentage", batchPercentage, "currentBatchIndex", currentBatchIndex, "delta", delta) |
| 87 | + |
| 88 | + var ctrlResult *ctrl.Result |
| 89 | + if delta == 0 { |
| 90 | + patch := client.MergeFrom(pool.DeepCopy()) |
| 91 | + newUpdateProgress = min((currentBatchIndex+1)*batchPercentage, 100) |
| 92 | + component.SetUpdateProgress(newStatus, newUpdateProgress) |
| 93 | + if newUpdateProgress != 100 { |
| 94 | + component.SetBatchUpdateLastTimeInfo(pool, time.Now().Format(time.RFC3339)) |
| 95 | + interval := max(batchInterval, constants.PendingRequeueDuration) |
| 96 | + ctrlResult = &ctrl.Result{RequeueAfter: interval} |
| 97 | + log.Info("current batch update has completed", "progress", newUpdateProgress, "currentBatchIndex", currentBatchIndex, "nextUpdateTime", time.Now().Add(interval)) |
| 98 | + } else { |
| 99 | + component.SetUpdateInProgressInfo(pool, "") |
| 100 | + component.SetBatchUpdateLastTimeInfo(pool, "") |
| 101 | + log.Info("all batch update has completed", "component", component.GetName(), "hash", configHash) |
| 102 | + } |
| 103 | + if err := r.Patch(ctx, pool, patch); err != nil { |
| 104 | + return nil, fmt.Errorf("failed to patch pool: %w", err) |
| 105 | + } |
| 106 | + } else if delta > 0 { |
| 107 | + recheck, err := component.PerformBatchUpdate(r, ctx, pool, int(delta)) |
| 108 | + if err != nil { |
| 109 | + return nil, err |
| 110 | + } else if recheck { |
| 111 | + ctrlResult = &ctrl.Result{RequeueAfter: constants.PendingRequeueDuration} |
| 112 | + } |
| 113 | + } |
| 114 | + |
| 115 | + return ctrlResult, patchComponentStatus(r, ctx, pool, newStatus) |
| 116 | +} |
| 117 | + |
| 118 | +func patchComponentStatus(r client.Client, ctx context.Context, pool *tfv1.GPUPool, newStatus *tfv1.PoolComponentStatus) error { |
| 119 | + patch := client.MergeFrom(pool.DeepCopy()) |
| 120 | + pool.Status.ComponentStatus = *newStatus |
| 121 | + if err := r.Status().Patch(ctx, pool, patch); err != nil { |
| 122 | + return fmt.Errorf("failed to patch pool status: %w", err) |
| 123 | + } |
| 124 | + return nil |
| 125 | +} |
| 126 | + |
| 127 | +func getUpdatePolicy(pool *tfv1.GPUPool) (bool, time.Duration) { |
| 128 | + autoUpdate := false |
| 129 | + batchInterval := time.Duration(600) * time.Second |
| 130 | + |
| 131 | + if pool.Spec.NodeManagerConfig != nil { |
| 132 | + updatePolicy := pool.Spec.NodeManagerConfig.NodePoolRollingUpdatePolicy |
| 133 | + if updatePolicy != nil { |
| 134 | + if updatePolicy.AutoUpdate != nil { |
| 135 | + autoUpdate = *updatePolicy.AutoUpdate |
| 136 | + } |
| 137 | + |
| 138 | + duration, err := time.ParseDuration(updatePolicy.BatchInterval) |
| 139 | + if err == nil { |
| 140 | + batchInterval = duration |
| 141 | + } |
| 142 | + } |
| 143 | + } |
| 144 | + |
| 145 | + return autoUpdate, batchInterval |
| 146 | +} |
| 147 | + |
// calculateDesiredUpdatedDelta computes how many more resources must be updated
// to finish the batch that updateProgress currently points at.
//
// total is the number of resources, updatedSize the number already updated,
// batchPercentage the share of total handled per batch, and updateProgress the
// recorded progress percentage.
//
// It returns, in order:
//   - delta: resources still to update in the current batch (never negative);
//   - the possibly advanced progress percentage;
//   - the zero-based index of the batch that progress falls into.
//
// A batchPercentage of 0 or less is treated as 1: zero would panic with an
// integer divide by zero, and a negative value would never terminate.
func calculateDesiredUpdatedDelta(total int, updatedSize int, batchPercentage int32, updateProgress int32) (int32, int32, int32) {
	if batchPercentage <= 0 {
		batchPercentage = 1
	}
	batchSize := int32(getValueFromPercent(int(batchPercentage), total, true))

	progress := updateProgress
	for {
		batchIndex := progress / batchPercentage
		desired := min((batchIndex+1)*batchSize, int32(total))
		delta := desired - int32(updatedSize)
		if delta >= 0 {
			return delta, progress, batchIndex
		}
		// More resources are updated than this batch asks for. This happens if
		// the rolling update policy changed or new nodes were added during the
		// update, so move the progress on to the next batch.
		if progress >= 100 {
			// Progress cannot advance any further (updatedSize exceeds total);
			// report the rollout as having nothing left to do instead of
			// looping forever.
			return 0, progress, batchIndex
		}
		progress = min(progress+batchPercentage, 100)
	}
}

// getValueFromPercent returns percent% of total as an integer, rounded up when
// roundUp is true and rounded down otherwise.
func getValueFromPercent(percent int, total int, roundUp bool) int {
	value := float64(percent) * float64(total) / 100
	if roundUp {
		return int(math.Ceil(value))
	}
	return int(math.Floor(value))
}
0 commit comments