@@ -875,7 +875,7 @@ func (c *ModelServingController) scaleDownRoles(ctx context.Context, ms *workloa
875875
876876// scaleUpRoles handles Role scaling up.
877877// It creates new Roles with increasing indices starting from the current max index + 1.
878- func (c * ModelServingController ) scaleUpRoles (ctx context.Context , ms * workloadv1alpha1.ModelServing , groupName string , targetRole workloadv1alpha1.Role , roleList []datastore.Role , expectedCount int , servingGroupOrdinal int , effectiveRevision string ) {
878+ func (c * ModelServingController ) scaleUpRoles (ctx context.Context , ms * workloadv1alpha1.ModelServing , groupName string , targetRole workloadv1alpha1.Role , roleList []datastore.Role , expectedCount int , servingGroupOrdinal int , revision string ) {
879879 startingIndex := 0
880880 if len (roleList ) > 0 {
881881 _ , ordinal := utils .GetParentNameAndOrdinal (roleList [len (roleList )- 1 ].Name )
@@ -899,13 +899,13 @@ func (c *ModelServingController) scaleUpRoles(ctx context.Context, ms *workloadv
899899 for i := 0 ; i < toCreate ; i ++ {
900900 newIndex := startingIndex + i
901901 // Create pods for role
902- err := c .CreatePodsByRole (ctx , * targetRole .DeepCopy (), ms , newIndex , servingGroupOrdinal , effectiveRevision )
902+ err := c .CreatePodsByRole (ctx , * targetRole .DeepCopy (), ms , newIndex , servingGroupOrdinal , revision )
903903 if err != nil {
904904 klog .Errorf ("create role %s for ServingGroup %s failed: %v" , utils .GenerateRoleID (targetRole .Name , newIndex ), groupName , err )
905905 } else {
906906 // Insert new Role to global storage
907907 roleID := utils .GenerateRoleID (targetRole .Name , newIndex )
908- c .store .AddRole (utils .GetNamespaceName (ms ), groupName , targetRole .Name , roleID , effectiveRevision )
908+ c .store .AddRole (utils .GetNamespaceName (ms ), groupName , targetRole .Name , roleID , revision )
909909 // Emit event for new role entering Creating state
910910 message := fmt .Sprintf ("Role %s/%s in ServingGroup %s is now Creating" , targetRole .Name , roleID , groupName )
911911 c .emitRoleStatusEvent (ms , corev1 .EventTypeNormal , "RoleCreating" , message )
@@ -917,9 +917,10 @@ func (c *ModelServingController) scaleUpRoles(ctx context.Context, ms *workloadv
917917// It handles both scale up and scale down operations for the role
918918func (c * ModelServingController ) manageRoleReplicas (ctx context.Context , ms * workloadv1alpha1.ModelServing , groupName string , targetRole workloadv1alpha1.Role , servingGroupOrdinal int , newRevision string ) {
919919 // TODO: add podGroup update after gang scheduler finished
920- effectiveRevision := newRevision
921- if revision , ok := c .store .GetServingGroupRevision (utils .GetNamespaceName (ms ), groupName ); ok && revision != "" {
922- effectiveRevision = revision
920+ // Prefer the revision already stored for this ServingGroup (existing groups are partition-protected); fall back to newRevision when no non-empty revision is stored
921+ revision := newRevision
922+ if storedRevision , ok := c .store .GetServingGroupRevision (utils .GetNamespaceName (ms ), groupName ); ok && storedRevision != "" {
923+ revision = storedRevision
923924 }
924925
925926 // Get all replicas of a role from storage, for example, prefill-0, prefill-1...
@@ -955,7 +956,7 @@ func (c *ModelServingController) manageRoleReplicas(ctx context.Context, ms *wor
955956 if len (pods ) < expectedPods {
956957 klog .V (2 ).Infof ("manageRoleReplicas: role %s/%s in ServingGroup %s is missing pods (%d/%d), recreating" , targetRole .Name , roleObj .Name , groupName , len (pods ), expectedPods )
957958 _ , roleIndex := utils .GetParentNameAndOrdinal (roleObj .Name )
958- if err := c .CreatePodsByRole (ctx , * targetRole .DeepCopy (), ms , roleIndex , servingGroupOrdinal , effectiveRevision ); err != nil {
959+ if err := c .CreatePodsByRole (ctx , * targetRole .DeepCopy (), ms , roleIndex , servingGroupOrdinal , revision ); err != nil {
959960 klog .Errorf ("manageRoleReplicas: failed to recreate pods for role %s/%s in ServingGroup %s: %v" , targetRole .Name , roleObj .Name , groupName , err )
960961 }
961962 }
@@ -964,7 +965,7 @@ func (c *ModelServingController) manageRoleReplicas(ctx context.Context, ms *wor
964965 // Determine whether it is a scale-up or scale-down scenario
965966 if len (roleList ) < expectedCount {
966967 klog .V (2 ).Infof ("manageRoleReplicas: scaling UP role %s in ServingGroup %s: current=%d, expected=%d" , targetRole .Name , groupName , len (roleList ), expectedCount )
967- c .scaleUpRoles (ctx , ms , groupName , targetRole , roleList , expectedCount , servingGroupOrdinal , effectiveRevision )
968+ c .scaleUpRoles (ctx , ms , groupName , targetRole , roleList , expectedCount , servingGroupOrdinal , revision )
968969 } else if len (roleList ) > expectedCount {
969970 klog .V (2 ).Infof ("manageRoleReplicas: scaling DOWN role %s in ServingGroup %s: current=%d, expected=%d" , targetRole .Name , groupName , len (roleList ), expectedCount )
970971 c .scaleDownRoles (ctx , ms , groupName , targetRole , roleList , expectedCount )
0 commit comments