Added logic to update ModelStatus (kserve#2088)
* added logic to update model status

Signed-off-by: Suresh Nakkeran <[email protected]>

* added tests for modelstatus support

Signed-off-by: Suresh Nakkeran <[email protected]>

* model status changes - incorporated review comments

Signed-off-by: Suresh Nakkeran <[email protected]>

* adding more tests for modelstatus changes

Signed-off-by: Suresh Nakkeran <[email protected]>
Suresh-Nakkeran authored May 28, 2022
1 parent 8a99af2 commit 1a55119
Showing 11 changed files with 730 additions and 2 deletions.
1 change: 1 addition & 0 deletions config/crd/serving.kserve.io_inferenceservices.yaml
@@ -13055,6 +13055,7 @@ spec:
enum:
- ModelLoadFailed
- RuntimeUnhealthy
- RuntimeDisabled
- NoSupportingRuntime
- RuntimeNotRecognized
- InvalidPredictorSpec
8 changes: 8 additions & 0 deletions config/rbac/role.yaml
@@ -73,6 +73,14 @@ rules:
- get
- list
- watch
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
117 changes: 116 additions & 1 deletion pkg/apis/serving/v1beta1/inference_service_status.go
@@ -17,6 +17,9 @@ limitations under the License.
package v1beta1

import (
"reflect"

"github.com/kserve/kserve/pkg/constants"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -185,7 +188,7 @@ const (
)

// FailureReason enum
// +kubebuilder:validation:Enum=ModelLoadFailed;RuntimeUnhealthy;NoSupportingRuntime;RuntimeNotRecognized;InvalidPredictorSpec
// +kubebuilder:validation:Enum=ModelLoadFailed;RuntimeUnhealthy;RuntimeDisabled;NoSupportingRuntime;RuntimeNotRecognized;InvalidPredictorSpec
type FailureReason string

// FailureReason enum values
@@ -194,6 +197,8 @@ const (
ModelLoadFailed FailureReason = "ModelLoadFailed"
// Corresponding ServingRuntime containers failed to start or are unhealthy
RuntimeUnhealthy FailureReason = "RuntimeUnhealthy"
// The ServingRuntime is disabled
RuntimeDisabled FailureReason = "RuntimeDisabled"
// There is no ServingRuntime which supports the specified model type
NoSupportingRuntime FailureReason = "NoSupportingRuntime"
// There is no ServingRuntime defined with the specified runtime name
@@ -394,3 +399,113 @@ func (ss *InferenceServiceStatus) ClearCondition(conditionType apis.ConditionTyp
conditionSet.Manage(ss).ClearCondition(conditionType)
}
}

func (ss *InferenceServiceStatus) UpdateModelRevisionStates(modelState ModelState, totalCopies int, info *FailureInfo) {
if ss.ModelStatus.ModelRevisionStates == nil {
ss.ModelStatus.ModelRevisionStates = &ModelRevisionStates{TargetModelState: modelState}
} else {
ss.ModelStatus.ModelRevisionStates.TargetModelState = modelState
}
// Update transition status, failure info based on new model state
if modelState == Pending || modelState == Loading {
ss.ModelStatus.TransitionStatus = InProgress
} else if modelState == Loaded {
ss.ModelStatus.TransitionStatus = UpToDate
ss.ModelStatus.ModelCopies = &ModelCopies{TotalCopies: totalCopies}
ss.ModelStatus.ModelRevisionStates.ActiveModelState = Loaded
} else if modelState == FailedToLoad {
ss.ModelStatus.TransitionStatus = BlockedByFailedLoad
}
if info != nil {
ss.SetModelFailureInfo(info)
}
}

func (ss *InferenceServiceStatus) UpdateModelTransitionStatus(status TransitionStatus, info *FailureInfo) {
ss.ModelStatus.TransitionStatus = status
// Update model state to 'FailedToLoad' in case of invalid spec provided
if ss.ModelStatus.TransitionStatus == InvalidSpec {
if ss.ModelStatus.ModelRevisionStates == nil {
ss.ModelStatus.ModelRevisionStates = &ModelRevisionStates{TargetModelState: FailedToLoad}
} else {
ss.ModelStatus.ModelRevisionStates.TargetModelState = FailedToLoad
}
}
if info != nil {
ss.SetModelFailureInfo(info)
}
}

func (ss *InferenceServiceStatus) SetModelFailureInfo(info *FailureInfo) bool {
if reflect.DeepEqual(info, ss.ModelStatus.LastFailureInfo) {
return false
}
ss.ModelStatus.LastFailureInfo = info
return true
}

func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatusSpec, podList *v1.PodList, rawDeployment bool) {
// Check that at least one pod is running for the latest revision of the inferenceservice
totalCopies := len(podList.Items)
if totalCopies == 0 {
ss.UpdateModelRevisionStates(Pending, totalCopies, nil)
return
}
// Update model state to 'Loaded' if inferenceservice status is ready.
// For serverless deployment, the latest created revision and the latest ready revision should be equal
if ss.IsReady() {
if rawDeployment {
ss.UpdateModelRevisionStates(Loaded, totalCopies, nil)
return
} else if statusSpec.LatestCreatedRevision == statusSpec.LatestReadyRevision {
ss.UpdateModelRevisionStates(Loaded, totalCopies, nil)
return
}
}
// Update model state to 'Loading' if storage initializer is running.
// If the storage initializer is terminated due to error or crashloopbackoff, update model
// state to 'ModelLoadFailed' with failure info.
for _, cs := range podList.Items[0].Status.InitContainerStatuses {
if cs.Name == constants.StorageInitializerContainerName {
if cs.State.Running != nil {
ss.UpdateModelRevisionStates(Loading, totalCopies, nil)
return
} else if cs.State.Terminated != nil &&
cs.State.Terminated.Reason == constants.StateReasonError {
ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{
Reason: ModelLoadFailed,
Message: cs.State.Terminated.Message,
})
return
} else if cs.State.Waiting != nil &&
cs.State.Waiting.Reason == constants.StateReasonCrashLoopBackOff {
ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{
Reason: ModelLoadFailed,
Message: cs.LastTerminationState.Terminated.Message,
})
return
}
}
}
// If the kserve container is terminated due to error or crashloopbackoff, update model
// state to 'ModelLoadFailed' with failure info.
for _, cs := range podList.Items[0].Status.ContainerStatuses {
if cs.Name == constants.InferenceServiceContainerName {
if cs.State.Terminated != nil &&
cs.State.Terminated.Reason == constants.StateReasonError {
ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{
Reason: ModelLoadFailed,
Message: cs.State.Terminated.Message,
})
} else if cs.State.Waiting != nil &&
cs.State.Waiting.Reason == constants.StateReasonCrashLoopBackOff {
ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{
Reason: ModelLoadFailed,
Message: cs.LastTerminationState.Terminated.Message,
})
} else {
ss.UpdateModelRevisionStates(Pending, totalCopies, nil)
}
}
}
}
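
For reference, here is a minimal sketch of how these new helpers drive the model status fields, assuming the v1beta1 package is imported from github.com/kserve/kserve/pkg/apis/serving/v1beta1 and using made-up scenario values:

package main

import (
	"fmt"

	"github.com/kserve/kserve/pkg/apis/serving/v1beta1"
)

func main() {
	status := &v1beta1.InferenceServiceStatus{}

	// A freshly created revision has no running pods yet:
	// the target state becomes Pending and the transition is marked InProgress.
	status.UpdateModelRevisionStates(v1beta1.Pending, 0, nil)
	fmt.Println(status.ModelStatus.TransitionStatus) // InProgress

	// Once the service is ready, the model is marked Loaded and the copy count is recorded.
	status.UpdateModelRevisionStates(v1beta1.Loaded, 2, nil)
	fmt.Println(status.ModelStatus.ModelRevisionStates.ActiveModelState) // Loaded
	fmt.Println(status.ModelStatus.ModelCopies.TotalCopies)              // 2

	// An invalid spec blocks the transition, forces the target state to FailedToLoad,
	// and records the failure details.
	status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
		Reason:  v1beta1.RuntimeDisabled,
		Message: "Specified runtime is disabled",
	})
	fmt.Println(status.ModelStatus.ModelRevisionStates.TargetModelState) // FailedToLoad
}

PropagateModelStatus above automates the same transitions by inspecting the predictor pods; predictor.go below wires it into the reconcile loop.
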
17 changes: 16 additions & 1 deletion pkg/constants/constants.go
@@ -237,7 +237,8 @@ const (

// InferenceService container name
const (
InferenceServiceContainerName = "kserve-container"
InferenceServiceContainerName = "kserve-container"
StorageInitializerContainerName = "storage-initializer"
)

// DefaultModelLocalMountPath is where models will be mounted by the storage-initializer
@@ -339,6 +340,20 @@ const (
Unknown
)

// revision label
const (
RevisionLabel = "serving.knative.dev/revision"
RawDeploymentAppLabel = "app"
)

// container state reason
const (
StateReasonRunning = "Running"
StateReasonCompleted = "Completed"
StateReasonError = "Error"
StateReasonCrashLoopBackOff = "CrashLoopBackOff"
)

// GetRawServiceLabel generate native service label
func GetRawServiceLabel(service string) string {
return "isvc." + service
12 changes: 12 additions & 0 deletions pkg/controller/v1alpha1/trainedmodel/controller_test.go
@@ -106,6 +106,12 @@ var _ = Describe("v1beta1 TrainedModel controller", func() {
},
},
}
modelStatus = v1beta1.ModelStatus{
TransitionStatus: v1beta1.UpToDate,
ModelRevisionStates: &v1beta1.ModelRevisionStates{
ActiveModelState: v1beta1.Loaded,
},
}
)

Context("When creating a new TrainedModel with an unready InferenceService", func() {
@@ -234,6 +240,7 @@ var _ = Describe("v1beta1 TrainedModel controller", func() {
}, timeout, interval).Should(BeTrue())

inferenceService.Status.Status = readyConditions
inferenceService.Status.ModelStatus = modelStatus
Expect(k8sClient.Status().Update(context.TODO(), inferenceService)).To(BeNil())

// Create modelConfig
@@ -349,6 +356,7 @@ var _ = Describe("v1beta1 TrainedModel controller", func() {
URL: clusterURL,
}
inferenceService.Status.Status = readyConditions
inferenceService.Status.ModelStatus = modelStatus
Expect(k8sClient.Status().Update(context.TODO(), inferenceService)).To(BeNil())

tmInstance := &v1alpha1api.TrainedModel{
@@ -496,6 +504,7 @@ var _ = Describe("v1beta1 TrainedModel controller", func() {
}, timeout, interval).Should(BeTrue())

inferenceService.Status.Status = readyConditions
inferenceService.Status.ModelStatus = modelStatus
Expect(k8sClient.Status().Update(context.TODO(), inferenceService)).To(BeNil())

tmInstance := &v1alpha1api.TrainedModel{
@@ -624,6 +633,7 @@ var _ = Describe("v1beta1 TrainedModel controller", func() {
}, timeout, interval).Should(BeTrue())

inferenceService.Status.Status = readyConditions
inferenceService.Status.ModelStatus = modelStatus
Expect(k8sClient.Status().Update(context.TODO(), inferenceService)).To(BeNil())

// Create modelConfig
@@ -755,6 +765,7 @@ var _ = Describe("v1beta1 TrainedModel controller", func() {
}, timeout, interval).Should(BeTrue())

inferenceService.Status.Status = readyConditions
inferenceService.Status.ModelStatus = modelStatus
Expect(k8sClient.Status().Update(context.TODO(), inferenceService)).To(BeNil())

// Create modelConfig
@@ -885,6 +896,7 @@ var _ = Describe("v1beta1 TrainedModel controller", func() {
}, timeout, interval).Should(BeTrue())

inferenceService.Status.Status = readyConditions
inferenceService.Status.ModelStatus = modelStatus
Expect(k8sClient.Status().Update(context.TODO(), inferenceService)).To(BeNil())

// Create modelConfig
54 changes: 54 additions & 0 deletions pkg/controller/v1beta1/inferenceservice/components/predictor.go
@@ -93,20 +93,36 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro
isvc.SetRuntimeDefaults()
r, err := isvcutils.GetServingRuntime(p.client, *isvc.Spec.Predictor.Model.Runtime, isvc.Namespace)
if err != nil {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.RuntimeNotRecognized,
Message: "Waiting for runtime to become available",
})
return ctrl.Result{}, err
}

if r.IsDisabled() {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.RuntimeDisabled,
Message: "Specified runtime is disabled",
})
return ctrl.Result{}, fmt.Errorf("specified runtime %s is disabled", *isvc.Spec.Predictor.Model.Runtime)
}

if isvc.Spec.Predictor.Model.ProtocolVersion != nil &&
!r.IsProtocolVersionSupported(*isvc.Spec.Predictor.Model.ProtocolVersion) {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.NoSupportingRuntime,
Message: "Specified runtime does not support specified protocol version",
})
return ctrl.Result{}, fmt.Errorf("specified runtime %s does not support specified protocol version", *isvc.Spec.Predictor.Model.Runtime)
}

// Verify that the selected runtime supports the specified framework.
if !isvc.Spec.Predictor.Model.RuntimeSupportsModel(r) {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.NoSupportingRuntime,
Message: "Specified runtime does not support specified framework/version",
})
return ctrl.Result{}, fmt.Errorf("specified runtime %s does not support specified framework/version", *isvc.Spec.Predictor.Model.Runtime)
}

@@ -117,6 +133,10 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro
return ctrl.Result{}, err
}
if len(runtimes) == 0 {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.NoSupportingRuntime,
Message: "No runtime found to support specified framework/version",
})
return ctrl.Result{}, fmt.Errorf("no runtime found to support predictor with model type: %v", isvc.Spec.Predictor.Model.ModelFormat)
}
// Get first supporting runtime.
@@ -137,16 +157,28 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro
}

if len(sRuntime.Containers) == 0 {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.InvalidPredictorSpec,
Message: "No container configuration found in selected serving runtime",
})
return ctrl.Result{}, errors.New("no container configuration found in selected serving runtime")
}
// Assume only one container is specified in runtime spec.
container, err = isvcutils.MergeRuntimeContainers(&sRuntime.Containers[0], &isvc.Spec.Predictor.Model.Container)
if err != nil {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.InvalidPredictorSpec,
Message: "Failed to get runtime container",
})
return ctrl.Result{}, errors.Wrapf(err, "failed to get runtime container")
}

mergedPodSpec, err := isvcutils.MergePodSpec(&sRuntime.ServingRuntimePodSpec, &isvc.Spec.Predictor.PodSpec)
if err != nil {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.InvalidPredictorSpec,
Message: "Failed to consolidate serving runtime PodSpecs",
})
return ctrl.Result{}, errors.Wrapf(err, "failed to consolidate serving runtime PodSpecs")
}

@@ -155,6 +187,10 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro

// Replace placeholders in runtime container by values from inferenceservice metadata
if err = isvcutils.ReplacePlaceholders(container, isvc.ObjectMeta); err != nil {
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{
Reason: v1beta1.InvalidPredictorSpec,
Message: "Failed to replace placeholders in serving runtime Container",
})
return ctrl.Result{}, errors.Wrapf(err, "failed to replace placeholders in serving runtime Container")
}

@@ -205,8 +241,14 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro
return ctrl.Result{}, err
}

var rawDeployment bool
var podLabelKey string
var podLabelValue string

// Here we allow switch between knative and vanilla deployment
if isvcutils.GetDeploymentMode(annotations, deployConfig) == constants.RawDeployment {
rawDeployment = true
podLabelKey = constants.RawDeploymentAppLabel
r, err := raw.NewRawKubeReconciler(p.client, p.scheme, objectMeta, &isvc.Spec.Predictor.ComponentExtensionSpec,
&podSpec)
if err != nil {
@@ -233,6 +275,7 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro
}
isvc.Status.PropagateRawStatus(v1beta1.PredictorComponent, deployment, r.URL)
} else {
podLabelKey = constants.RevisionLabel
r := knative.NewKsvcReconciler(p.client, p.scheme, objectMeta, &isvc.Spec.Predictor.ComponentExtensionSpec,
&podSpec, isvc.Status.Components[v1beta1.PredictorComponent])
if err := controllerutil.SetControllerReference(isvc, r.Service, p.scheme); err != nil {
@@ -244,6 +287,17 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro
}
isvc.Status.PropagateStatus(v1beta1.PredictorComponent, status)
}
statusSpec, _ := isvc.Status.Components[v1beta1.PredictorComponent]
if rawDeployment {
podLabelValue = constants.GetRawServiceLabel(constants.DefaultPredictorServiceName(isvc.ObjectMeta.Name))
} else {
podLabelValue = statusSpec.LatestCreatedRevision
}
podList, err := isvcutils.ListPodsByLabel(p.client, isvc.ObjectMeta.Namespace, podLabelKey, podLabelValue)
if err != nil {
return ctrl.Result{}, errors.Wrapf(err, "failed to list inferenceservice pods by label")
}
isvc.Status.PropagateModelStatus(statusSpec, podList, rawDeployment)

return ctrl.Result{}, nil
}
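
The pod lookup above goes through isvcutils.ListPodsByLabel, which is added elsewhere in this change set (one of the 11 changed files not expanded on this page). Purely for illustration, a helper with the call shape used above could look roughly like the following controller-runtime sketch; the package location and body are assumptions, not the committed implementation:

package utils

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ListPodsByLabel lists the pods in a namespace whose labels match labelKey=labelValue.
// Sketch only: the real helper may sort or filter the pods differently.
func ListPodsByLabel(cl client.Client, namespace string, labelKey string, labelValue string) (*v1.PodList, error) {
	podList := &v1.PodList{}
	opts := []client.ListOption{
		client.InNamespace(namespace),
		client.MatchingLabels{labelKey: labelValue},
	}
	if err := cl.List(context.TODO(), podList, opts...); err != nil {
		return nil, err
	}
	return podList, nil
}

With raw deployments the selector is app=isvc.<predictor-service-name> (via constants.GetRawServiceLabel), while serverless deployments select on the Knative serving.knative.dev/revision label of the latest created revision.
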