-
Notifications
You must be signed in to change notification settings - Fork 84
Fix/modelserving partition revision regressions #840
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
c37c696
94f4bc9
3cb3dbb
a7cb67b
943c22d
61381d6
c232fa7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -875,7 +875,7 @@ func (c *ModelServingController) scaleDownRoles(ctx context.Context, ms *workloa | |
|
|
||
| // scaleUpRoles handles Role scaling up. | ||
| // It creates new Roles with increasing indices starting from the current max index + 1. | ||
| func (c *ModelServingController) scaleUpRoles(ctx context.Context, ms *workloadv1alpha1.ModelServing, groupName string, targetRole workloadv1alpha1.Role, roleList []datastore.Role, expectedCount int, servingGroupOrdinal int, newRevision string) { | ||
| func (c *ModelServingController) scaleUpRoles(ctx context.Context, ms *workloadv1alpha1.ModelServing, groupName string, targetRole workloadv1alpha1.Role, roleList []datastore.Role, expectedCount int, servingGroupOrdinal int, revision string) { | ||
| startingIndex := 0 | ||
| if len(roleList) > 0 { | ||
| _, ordinal := utils.GetParentNameAndOrdinal(roleList[len(roleList)-1].Name) | ||
|
|
@@ -899,13 +899,13 @@ func (c *ModelServingController) scaleUpRoles(ctx context.Context, ms *workloadv | |
| for i := 0; i < toCreate; i++ { | ||
| newIndex := startingIndex + i | ||
| // Create pods for role | ||
| err := c.CreatePodsByRole(ctx, *targetRole.DeepCopy(), ms, newIndex, servingGroupOrdinal, newRevision) | ||
| err := c.CreatePodsByRole(ctx, *targetRole.DeepCopy(), ms, newIndex, servingGroupOrdinal, revision) | ||
| if err != nil { | ||
| klog.Errorf("create role %s for ServingGroup %s failed: %v", utils.GenerateRoleID(targetRole.Name, newIndex), groupName, err) | ||
| } else { | ||
| // Insert new Role to global storage | ||
| roleID := utils.GenerateRoleID(targetRole.Name, newIndex) | ||
| c.store.AddRole(utils.GetNamespaceName(ms), groupName, targetRole.Name, roleID, newRevision) | ||
| c.store.AddRole(utils.GetNamespaceName(ms), groupName, targetRole.Name, roleID, revision) | ||
| // Emit event for new role entering Creating state | ||
| message := fmt.Sprintf("Role %s/%s in ServingGroup %s is now Creating", targetRole.Name, roleID, groupName) | ||
| c.emitRoleStatusEvent(ms, corev1.EventTypeNormal, "RoleCreating", message) | ||
|
|
@@ -917,6 +917,12 @@ func (c *ModelServingController) scaleUpRoles(ctx context.Context, ms *workloadv | |
| // It handles both scale up and scale down operations for the role | ||
| func (c *ModelServingController) manageRoleReplicas(ctx context.Context, ms *workloadv1alpha1.ModelServing, groupName string, targetRole workloadv1alpha1.Role, servingGroupOrdinal int, newRevision string) { | ||
| // TODO: add podGroup update after gang scheduler finished | ||
| // Use the stored revision for existing groups (partition-protected), otherwise use newRevision | ||
| revision := newRevision | ||
| if storedRevision, ok := c.store.GetServingGroupRevision(utils.GetNamespaceName(ms), groupName); ok && storedRevision != "" { | ||
| revision = storedRevision | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think rolling updates should rely on manageServingGroupRollingUpdate + current/update revision + partition in status; old revisions in the store are not the authoritative source for rolling updates.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the store is not used and only the revision information from ModelServing.Status is relied on, it is also not possible to determine which serving groups need to be upgraded. |
||
|
|
||
| // Get all replicas of a role from storage, for example, prefill-0, prefill-1... | ||
| roleList, err := c.store.GetRoleList(utils.GetNamespaceName(ms), groupName, targetRole.Name) | ||
| if err != nil { | ||
|
|
@@ -950,7 +956,7 @@ func (c *ModelServingController) manageRoleReplicas(ctx context.Context, ms *wor | |
| if len(pods) < expectedPods { | ||
| klog.V(2).Infof("manageRoleReplicas: role %s/%s in ServingGroup %s is missing pods (%d/%d), recreating", targetRole.Name, roleObj.Name, groupName, len(pods), expectedPods) | ||
| _, roleIndex := utils.GetParentNameAndOrdinal(roleObj.Name) | ||
| if err := c.CreatePodsByRole(ctx, *targetRole.DeepCopy(), ms, roleIndex, servingGroupOrdinal, newRevision); err != nil { | ||
| if err := c.CreatePodsByRole(ctx, *targetRole.DeepCopy(), ms, roleIndex, servingGroupOrdinal, revision); err != nil { | ||
| klog.Errorf("manageRoleReplicas: failed to recreate pods for role %s/%s in ServingGroup %s: %v", targetRole.Name, roleObj.Name, groupName, err) | ||
| } | ||
| } | ||
|
|
@@ -959,7 +965,7 @@ func (c *ModelServingController) manageRoleReplicas(ctx context.Context, ms *wor | |
| // Determine whether it is a scale-up or scale-down scenario | ||
| if len(roleList) < expectedCount { | ||
| klog.V(2).Infof("manageRoleReplicas: scaling UP role %s in ServingGroup %s: current=%d, expected=%d", targetRole.Name, groupName, len(roleList), expectedCount) | ||
| c.scaleUpRoles(ctx, ms, groupName, targetRole, roleList, expectedCount, servingGroupOrdinal, newRevision) | ||
| c.scaleUpRoles(ctx, ms, groupName, targetRole, roleList, expectedCount, servingGroupOrdinal, revision) | ||
| } else if len(roleList) > expectedCount { | ||
katara-Jayprakash marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| klog.V(2).Infof("manageRoleReplicas: scaling DOWN role %s in ServingGroup %s: current=%d, expected=%d", targetRole.Name, groupName, len(roleList), expectedCount) | ||
| c.scaleDownRoles(ctx, ms, groupName, targetRole, roleList, expectedCount) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2557,6 +2557,151 @@ func TestManageRoleReplicas(t *testing.T) { | |
| } | ||
| } | ||
|
|
||
| func TestManageRoleReplicas_RecreateUsesServingGroupRevision(t *testing.T) { | ||
| kubeClient := kubefake.NewSimpleClientset() | ||
| kthenaClient := kthenafake.NewSimpleClientset() | ||
| volcanoClient := volcanofake.NewSimpleClientset() | ||
| apiextClient := apiextfake.NewSimpleClientset(testhelper.CreatePodGroupCRD()) | ||
|
|
||
| controller, err := NewModelServingController(kubeClient, kthenaClient, volcanoClient, apiextClient) | ||
| assert.NoError(t, err) | ||
|
|
||
| roleName := "default" | ||
| oldRevision := "revision-v1" | ||
| newRevision := "revision-v2" | ||
|
|
||
| ms := &workloadv1alpha1.ModelServing{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Namespace: "default", | ||
| Name: "test-manage-role-group-rev", | ||
| UID: types.UID("ms-uid-group-rev"), | ||
| }, | ||
| Spec: workloadv1alpha1.ModelServingSpec{ | ||
| Replicas: ptr.To[int32](1), | ||
| Template: workloadv1alpha1.ServingGroup{ | ||
| Roles: []workloadv1alpha1.Role{ | ||
| { | ||
| Name: roleName, | ||
| Replicas: ptr.To[int32](1), | ||
| WorkerReplicas: 0, | ||
| EntryTemplate: workloadv1alpha1.PodTemplateSpec{ | ||
| Spec: corev1.PodSpec{ | ||
| Containers: []corev1.Container{{ | ||
| Name: "entry-container", | ||
| Image: "test-image:latest", | ||
| }}, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| RecoveryPolicy: workloadv1alpha1.RoleRecreate, | ||
| }, | ||
| } | ||
|
|
||
| groupName := utils.GenerateServingGroupName(ms.Name, 0) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you please construct a real case, say |
||
| controller.store.AddServingGroup(utils.GetNamespaceName(ms), 0, oldRevision) | ||
| controller.store.AddRole(utils.GetNamespaceName(ms), groupName, roleName, utils.GenerateRoleID(roleName, 0), oldRevision) | ||
|
|
||
| controller.manageRoleReplicas(context.Background(), ms, groupName, ms.Spec.Template.Roles[0], 0, newRevision) | ||
|
|
||
| selector := labels.SelectorFromSet(map[string]string{ | ||
| workloadv1alpha1.GroupNameLabelKey: groupName, | ||
| workloadv1alpha1.RoleLabelKey: roleName, | ||
| }) | ||
| pods, err := kubeClient.CoreV1().Pods(ms.Namespace).List(context.Background(), metav1.ListOptions{LabelSelector: selector.String()}) | ||
| assert.NoError(t, err) | ||
| if assert.Len(t, pods.Items, 1) { | ||
| assert.Equal(t, oldRevision, pods.Items[0].Labels[workloadv1alpha1.RevisionLabelKey], | ||
| "recreated pod for existing serving group should keep the serving group revision") | ||
| } | ||
| } | ||
|
|
||
| // TestManageRoleReplicas_PartitionedScaleUp verifies that during a partitioned rolling update, | ||
| // group 0 (protected by partition=1) uses old revision while group 1 uses new revision. | ||
| func TestManageRoleReplicas_PartitionedScaleUp(t *testing.T) { | ||
| kubeClient := kubefake.NewSimpleClientset() | ||
| kthenaClient := kthenafake.NewSimpleClientset() | ||
| volcanoClient := volcanofake.NewSimpleClientset() | ||
| apiextClient := apiextfake.NewSimpleClientset(testhelper.CreatePodGroupCRD()) | ||
|
|
||
| controller, err := NewModelServingController(kubeClient, kthenaClient, volcanoClient, apiextClient) | ||
| assert.NoError(t, err) | ||
|
|
||
| roleName := "default" | ||
| oldRevision := "revision-v1" | ||
| newRevision := "revision-v2" | ||
|
|
||
| ms := &workloadv1alpha1.ModelServing{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Namespace: "default", | ||
| Name: "test-partition-scale", | ||
| UID: types.UID("ms-uid-partition"), | ||
| }, | ||
| Spec: workloadv1alpha1.ModelServingSpec{ | ||
| Replicas: ptr.To[int32](2), | ||
| Template: workloadv1alpha1.ServingGroup{ | ||
| Roles: []workloadv1alpha1.Role{ | ||
| { | ||
| Name: roleName, | ||
| Replicas: ptr.To[int32](1), | ||
| WorkerReplicas: 0, | ||
| EntryTemplate: workloadv1alpha1.PodTemplateSpec{ | ||
| Spec: corev1.PodSpec{ | ||
| Containers: []corev1.Container{{ | ||
| Name: "entry-container", | ||
| Image: "test-image:latest", | ||
| }}, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| RecoveryPolicy: workloadv1alpha1.RoleRecreate, | ||
| }, | ||
| } | ||
|
|
||
| // Setup: group 0 exists with old revision (protected by partition=1) | ||
| group0Name := utils.GenerateServingGroupName(ms.Name, 0) | ||
| controller.store.AddServingGroup(utils.GetNamespaceName(ms), 0, oldRevision) | ||
| controller.store.AddRole(utils.GetNamespaceName(ms), group0Name, roleName, utils.GenerateRoleID(roleName, 0), oldRevision) | ||
|
|
||
| // Setup: group 1 exists with new revision (above partition, should use new revision) | ||
| group1Name := utils.GenerateServingGroupName(ms.Name, 1) | ||
| controller.store.AddServingGroup(utils.GetNamespaceName(ms), 1, newRevision) | ||
| controller.store.AddRole(utils.GetNamespaceName(ms), group1Name, roleName, utils.GenerateRoleID(roleName, 0), newRevision) | ||
|
|
||
| // Scale up roles in group 0 - should use old revision (partition-protected) | ||
| controller.manageRoleReplicas(context.Background(), ms, group0Name, ms.Spec.Template.Roles[0], 0, newRevision) | ||
|
|
||
| // Scale up roles in group 1 - should use new revision | ||
| controller.manageRoleReplicas(context.Background(), ms, group1Name, ms.Spec.Template.Roles[0], 1, newRevision) | ||
|
|
||
| // Verify group 0 pods use old revision | ||
| selector0 := labels.SelectorFromSet(map[string]string{ | ||
| workloadv1alpha1.GroupNameLabelKey: group0Name, | ||
| workloadv1alpha1.RoleLabelKey: roleName, | ||
| }) | ||
| pods0, err := kubeClient.CoreV1().Pods(ms.Namespace).List(context.Background(), metav1.ListOptions{LabelSelector: selector0.String()}) | ||
| assert.NoError(t, err) | ||
| if assert.Len(t, pods0.Items, 1) { | ||
| assert.Equal(t, oldRevision, pods0.Items[0].Labels[workloadv1alpha1.RevisionLabelKey], | ||
| "group 0 (partition-protected) should use old revision") | ||
| } | ||
|
|
||
| // Verify group 1 pods use new revision | ||
| selector1 := labels.SelectorFromSet(map[string]string{ | ||
| workloadv1alpha1.GroupNameLabelKey: group1Name, | ||
| workloadv1alpha1.RoleLabelKey: roleName, | ||
| }) | ||
| pods1, err := kubeClient.CoreV1().Pods(ms.Namespace).List(context.Background(), metav1.ListOptions{LabelSelector: selector1.String()}) | ||
| assert.NoError(t, err) | ||
| if assert.Len(t, pods1.Items, 1) { | ||
| assert.Equal(t, newRevision, pods1.Items[0].Labels[workloadv1alpha1.RevisionLabelKey], | ||
| "group 1 (above partition) should use new revision") | ||
| } | ||
| } | ||
|
|
||
| // TestScaleDownServingGroups tests the scaleDownServingGroups function with various scenarios | ||
| func TestScaleDownServingGroups(t *testing.T) { | ||
| tests := []struct { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.