From 4bc3db42520087e5484e890069ab631f12b329b5 Mon Sep 17 00:00:00 2001 From: Hamza El-Saawy Date: Wed, 24 Sep 2025 15:06:41 -0400 Subject: [PATCH] Handle error events in HCS notifications. The shim parses the JSON result document and handle the error events (via `processHcsResult`) returned by HCS calls (e.g.,`vmcompute.HcsCreateComputeSystem`), but ignores the JSON payload for notifications (which are either received from a `processAsyncHcsResult` in the appropriate system or process call in `"internal.hcs"`, or via `waitForNotification` in `waitBackground`). This leads to ambiguous failure errors (e.g., `"The data is invalid."`) that require ETW traces to track down the appropriate HCS logs, when the error events could have provided enough context to identify the issue. Parse the `notificationData` JSON payload provided by HCS to the `notificationWatcher` callback into the appropriate `hcsResult` struct. Validate that the JSON data matches the notification HResult. Create a new error type (`hcsResult`) to handle both the error and events. Since notification results are always subsequently passed to either `make(System|Process)Error`, update those functions to handle the events provided by `hcsResult` errors. Since `ErrorEvent`s are always converted to strings in the context of serializing several of them into a string, add an `(*ErrorEvent).writeTo(*string.Builder)` function to provide more efficient error string generation for `(HCS|System|Process)Error`s. Additionally, consolidate the joining and formatting of error events for those error types. Signed-off-by: Hamza El-Saawy --- internal/hcs/callback.go | 72 ++++++++++++++++++++---- internal/hcs/errors.go | 118 +++++++++++++++++++++++++++++---------- 2 files changed, 149 insertions(+), 41 deletions(-) diff --git a/internal/hcs/callback.go b/internal/hcs/callback.go index 7b27173c3a..38a507dffb 100644 --- a/internal/hcs/callback.go +++ b/internal/hcs/callback.go @@ -3,14 +3,18 @@ package hcs import ( + "context" + "encoding/json" "fmt" "sync" "syscall" + "github.com/sirupsen/logrus" + "github.com/Microsoft/hcsshim/internal/interop" + "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" "github.com/Microsoft/hcsshim/internal/vmcompute" - "github.com/sirupsen/logrus" ) var ( @@ -132,10 +136,19 @@ func closeChannels(channels notificationChannels) { } } -func notificationWatcher(notificationType hcsNotification, callbackNumber uintptr, notificationStatus uintptr, notificationData *uint16) uintptr { - var result error - if int32(notificationStatus) < 0 { - result = interop.Win32FromHresult(notificationStatus) +func notificationWatcher( + notificationType hcsNotification, + callbackNumber uintptr, + notificationStatus uintptr, + notificationData *uint16, +) uintptr { + ctx, entry := log.SetEntry(context.Background(), logrus.Fields{ + "notification-type": notificationType.String(), + }) + + result := processNotification(ctx, notificationStatus, notificationData) + if result != nil { + entry.Data[logrus.ErrorKey] = result } callbackMapLock.RLock() @@ -143,17 +156,15 @@ func notificationWatcher(notificationType hcsNotification, callbackNumber uintpt callbackMapLock.RUnlock() if context == nil { + entry.WithField("context", callbackNumber).Warn("Received notification for unknown context (callback number)") return 0 } - log := logrus.WithFields(logrus.Fields{ - "notification-type": notificationType.String(), - "system-id": context.systemID, - }) + entry.Data["system-id"] = context.systemID if context.processID != 0 { - log.Data[logfields.ProcessID] = context.processID + entry.Data[logfields.ProcessID] = context.processID } - log.Debug("HCS notification") + entry.Debug("HCS notification") if channel, ok := context.channels[notificationType]; ok { channel <- result @@ -161,3 +172,42 @@ func notificationWatcher(notificationType hcsNotification, callbackNumber uintpt return 0 } + +// processNotification parses and validates HCS notifications and returns the result as an error. +func processNotification(ctx context.Context, notificationStatus uintptr, notificationData *uint16) (err error) { + // TODO: merge/unify with [processHcsResult] + status := int32(notificationStatus) + if status < 0 { + err = interop.Win32FromHresult(notificationStatus) + } + + if notificationData == nil { + return err + } + + resultJSON := interop.ConvertAndFreeCoTaskMemString(notificationData) + result := &hcsResult{} + if jsonErr := json.Unmarshal([]byte(resultJSON), result); jsonErr != nil { + log.G(ctx).WithFields(logrus.Fields{ + logfields.JSON: resultJSON, + logrus.ErrorKey: err, + }).Warn("Could not unmarshal HCS result") + return err + } + + // the HResult and data payload should have the same error value + if result.Error < 0 && status != result.Error { + log.G(ctx).WithFields(logrus.Fields{ + "status": status, + "data": result.Error, + }).Warn("Mismatched status and data HResult values") + } + + if len(result.ErrorEvents) > 0 { + return &resultError{ + Err: err, + Events: result.ErrorEvents, + } + } + return err +} diff --git a/internal/hcs/errors.go b/internal/hcs/errors.go index 3e10f5c7e0..a5900f35f1 100644 --- a/internal/hcs/errors.go +++ b/internal/hcs/errors.go @@ -8,9 +8,13 @@ import ( "errors" "fmt" "net" + "strings" "syscall" + "github.com/sirupsen/logrus" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" ) var ( @@ -99,41 +103,57 @@ type ErrorEvent struct { EventID uint16 `json:"EventId,omitempty"` Flags uint32 `json:"Flags,omitempty"` Source string `json:"Source,omitempty"` - //Data []EventData `json:"Data,omitempty"` // Omit this as HCS doesn't encode this well. It's more confusing to include. It is however logged in debug mode (see processHcsResult function) -} -type hcsResult struct { - Error int32 - ErrorMessage string - ErrorEvents []ErrorEvent `json:"ErrorEvents,omitempty"` + // Omit this as HCS doesn't encode this well. It's more confusing to include. + // It is however logged in during [processHcsResult] errors. + //Data []EventData `json:"Data,omitempty"` } func (ev *ErrorEvent) String() string { - evs := "[Event Detail: " + ev.Message + sb := strings.Builder{} + ev.writeTo(&sb) + return sb.String() +} + +func (ev *ErrorEvent) writeTo(b *strings.Builder) { + // rough wag at needed length + b.Grow(64 + len(ev.Message) + len(ev.StackTrace) + len(ev.Provider) + len(ev.Source)) + + // [strings.Builder] Write* functions always return nil errors + _, _ = b.WriteString("[Event Detail: " + ev.Message) if ev.StackTrace != "" { - evs += " Stack Trace: " + ev.StackTrace + _, _ = b.WriteString(" Stack Trace: " + ev.StackTrace) } if ev.Provider != "" { - evs += " Provider: " + ev.Provider + _, _ = b.WriteString(" Provider: " + ev.Provider) } if ev.EventID != 0 { - evs = fmt.Sprintf("%s EventID: %d", evs, ev.EventID) + fmt.Fprintf(b, " EventID: %d", ev.EventID) } if ev.Flags != 0 { - evs = fmt.Sprintf("%s flags: %d", evs, ev.Flags) + fmt.Fprintf(b, " flags: %d", ev.Flags) } if ev.Source != "" { - evs += " Source: " + ev.Source + _, _ = b.WriteString(" Source: " + ev.Source) } - evs += "]" - return evs + _ = b.WriteByte(']') +} + +type hcsResult struct { + Error int32 + ErrorMessage string + ErrorEvents []ErrorEvent `json:"ErrorEvents,omitempty"` + // TODO: AttributionRecords } func processHcsResult(ctx context.Context, resultJSON string) []ErrorEvent { if resultJSON != "" { result := &hcsResult{} if err := json.Unmarshal([]byte(resultJSON), result); err != nil { - log.G(ctx).WithError(err).Warning("Could not unmarshal HCS result") + log.G(ctx).WithFields(logrus.Fields{ + logfields.JSON: resultJSON, + logrus.ErrorKey: err, + }).Warn("Could not unmarshal HCS result") return nil } return result.ErrorEvents @@ -141,6 +161,27 @@ func processHcsResult(ctx context.Context, resultJSON string) []ErrorEvent { return nil } +// resultError allows passing the events along with the error from [notificationWatcher] +// so they can both be including in the resulting [makeSystemError]/[makeProcessError] call. +// +// errorMessage **should** be the same as err.Error(), so it can safely be ignored. +type resultError struct { + Err error + Events []ErrorEvent +} + +func (e *resultError) Error() string { + return appendErrorEvents(e.Err.Error(), e.Events) +} + +func (e *resultError) Is(target error) bool { + return errors.Is(e.Err, target) +} + +func (e *resultError) Unwrap() error { + return e.Err +} + type HcsError struct { Op string Err error @@ -150,11 +191,7 @@ type HcsError struct { var _ net.Error = &HcsError{} func (e *HcsError) Error() string { - s := e.Op + ": " + e.Err.Error() - for _, ev := range e.Events { - s += "\n" + ev.String() - } - return s + return appendErrorEvents(e.Op+": "+e.Err.Error(), e.Events) } func (e *HcsError) Is(target error) bool { @@ -194,11 +231,7 @@ type SystemError struct { var _ net.Error = &SystemError{} func (e *SystemError) Error() string { - s := e.Op + " " + e.ID + ": " + e.Err.Error() - for _, ev := range e.Events { - s += "\n" + ev.String() - } - return s + return appendErrorEvents(e.Op+" "+e.ID+": "+e.Err.Error(), e.Events) } func makeSystemError(system *System, op string, err error, events []ErrorEvent) error { @@ -208,6 +241,7 @@ func makeSystemError(system *System, op string, err error, events []ErrorEvent) return err } + events, err = getEvents(events, err) return &SystemError{ ID: system.ID(), HcsError: HcsError{ @@ -228,11 +262,7 @@ type ProcessError struct { var _ net.Error = &ProcessError{} func (e *ProcessError) Error() string { - s := fmt.Sprintf("%s %s:%d: %s", e.Op, e.SystemID, e.Pid, e.Err.Error()) - for _, ev := range e.Events { - s += "\n" + ev.String() - } - return s + return appendErrorEvents(fmt.Sprintf("%s %s:%d: %s", e.Op, e.SystemID, e.Pid, e.Err.Error()), e.Events) } func makeProcessError(process *Process, op string, err error, events []ErrorEvent) error { @@ -241,6 +271,8 @@ func makeProcessError(process *Process, op string, err error, events []ErrorEven if errors.As(err, &e) { return err } + + events, err = getEvents(events, err) return &ProcessError{ Pid: process.Pid(), SystemID: process.SystemID(), @@ -252,6 +284,32 @@ func makeProcessError(process *Process, op string, err error, events []ErrorEven } } +// getEvents checks to see if err has events associated with it, and if so, returns those. +// This is used to flatten an [HceError], [SystemError], or [ProcessError] that wraps a [resultError] +func getEvents(events []ErrorEvent, err error) ([]ErrorEvent, error) { + // only return nested events if the original events is empty. + // don't use [errors.As], since that will unwrap the error chain and drop the parents. + if resErr, ok := err.(*resultError); err != nil && ok && len(events) == 0 { //nolint:errorlint + return resErr.Events, resErr.Err + } + return events, err +} + +func appendErrorEvents(s string, events []ErrorEvent) string { + if len(events) == 0 { + return s + } + + sb := &strings.Builder{} + _, _ = sb.WriteString(s) + for _, ev := range events { + // don't join with newlines since those are ... awkward within error strings + _, _ = sb.WriteString(": ") + ev.writeTo(sb) + } + return sb.String() +} + // IsNotExist checks if an error is caused by the Container or Process not existing. // Note: Currently, ErrElementNotFound can mean that a Process has either // already exited, or does not exist. Both IsAlreadyStopped and IsNotExist