Skip to content

Commit 124e5a3

Browse files
committed
Handle error events in HCS notifications.
The shim parses the JSON result document and handle the error events (via `processHcsResult`) returned by HCS calls (e.g.,`vmcompute.HcsCreateComputeSystem`), but ignores the JSON payload for notifications (which are either received from a `processAsyncHcsResult` in the appropriate system or process call in `"internal.hcs"`, or via `waitForNotification` in `waitBackground`). This leads to ambiguous failure errors (e.g., `"The data is invalid."`) that require ETW traces to track down the appropriate HCS logs, when the error events could have provided enough context to identify the issue. Parse the `notificationData` JSON payload provided by HCS to the `notificationWatcher` callback into the appropriate `hcsResult` struct. Validate that the JSON data matches the notification HResult. Create a new error type (`hcsResult`) to handle both the error and events. Since notification results are always subsequently passed to either `make(System|Process)Error`, update those functions to handle the events provided by `hcsResult` errors. Since `ErrorEvent`s are always converted to strings in the context of serializing several of them into a string, add an `(*ErrorEvent).writeTo(*string.Builder)` function to provide more efficient error string generation for `(HCS|System|Process)Error`s. Additionally, consolidate the joining and formatting of error events for those error types. Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>
1 parent 9e4e729 commit 124e5a3

File tree

2 files changed

+155
-45
lines changed

2 files changed

+155
-45
lines changed

internal/hcs/callback.go

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,18 @@
33
package hcs
44

55
import (
6+
"context"
7+
"encoding/json"
68
"fmt"
79
"sync"
810
"syscall"
911

12+
"github.com/sirupsen/logrus"
13+
1014
"github.com/Microsoft/hcsshim/internal/interop"
15+
"github.com/Microsoft/hcsshim/internal/log"
1116
"github.com/Microsoft/hcsshim/internal/logfields"
1217
"github.com/Microsoft/hcsshim/internal/vmcompute"
13-
"github.com/sirupsen/logrus"
1418
)
1519

1620
var (
@@ -132,32 +136,78 @@ func closeChannels(channels notificationChannels) {
132136
}
133137
}
134138

135-
func notificationWatcher(notificationType hcsNotification, callbackNumber uintptr, notificationStatus uintptr, notificationData *uint16) uintptr {
136-
var result error
137-
if int32(notificationStatus) < 0 {
138-
result = interop.Win32FromHresult(notificationStatus)
139+
func notificationWatcher(
140+
notificationType hcsNotification,
141+
callbackNumber uintptr,
142+
notificationStatus uintptr,
143+
notificationData *uint16,
144+
) uintptr {
145+
ctx, entry := log.SetEntry(context.Background(), logrus.Fields{
146+
"notification-type": notificationType.String(),
147+
})
148+
149+
result := processNotification(ctx, notificationStatus, notificationData)
150+
if result != nil {
151+
entry.Data[logrus.ErrorKey] = result
139152
}
140153

141154
callbackMapLock.RLock()
142-
context := callbackMap[callbackNumber]
155+
callbackCtx := callbackMap[callbackNumber]
143156
callbackMapLock.RUnlock()
144157

145-
if context == nil {
158+
if callbackCtx == nil {
159+
entry.WithField("callbackNumber", callbackNumber).Warn("Received notification for unknown callback number")
146160
return 0
147161
}
148162

149-
log := logrus.WithFields(logrus.Fields{
150-
"notification-type": notificationType.String(),
151-
"system-id": context.systemID,
152-
})
153-
if context.processID != 0 {
154-
log.Data[logfields.ProcessID] = context.processID
163+
entry.Data["system-id"] = callbackCtx.systemID
164+
if callbackCtx.processID != 0 {
165+
entry.Data[logfields.ProcessID] = callbackCtx.processID
155166
}
156-
log.Debug("HCS notification")
167+
entry.Debug("HCS notification")
157168

158-
if channel, ok := context.channels[notificationType]; ok {
169+
if channel, ok := callbackCtx.channels[notificationType]; ok {
159170
channel <- result
160171
}
161172

162173
return 0
163174
}
175+
176+
// processNotification parses and validates HCS notifications and returns the result as an error.
177+
func processNotification(ctx context.Context, notificationStatus uintptr, notificationData *uint16) (err error) {
178+
// TODO: merge/unify with [processHcsResult]
179+
status := int32(notificationStatus)
180+
if status < 0 {
181+
err = interop.Win32FromHresult(notificationStatus)
182+
}
183+
184+
if notificationData == nil {
185+
return err
186+
}
187+
188+
resultJSON := interop.ConvertAndFreeCoTaskMemString(notificationData)
189+
result := &hcsResult{}
190+
if jsonErr := json.Unmarshal([]byte(resultJSON), result); jsonErr != nil {
191+
log.G(ctx).WithFields(logrus.Fields{
192+
logfields.JSON: resultJSON,
193+
logrus.ErrorKey: err,
194+
}).Warn("Could not unmarshal HCS result")
195+
return err
196+
}
197+
198+
// the HResult and data payload should have the same error value
199+
if result.Error < 0 && status < 0 && status != result.Error {
200+
log.G(ctx).WithFields(logrus.Fields{
201+
"status": status,
202+
"data": result.Error,
203+
}).Warn("Mismatched status and data HResult values")
204+
}
205+
206+
if len(result.ErrorEvents) > 0 {
207+
return &resultError{
208+
Err: err,
209+
Events: result.ErrorEvents,
210+
}
211+
}
212+
return err
213+
}

internal/hcs/errors.go

Lines changed: 90 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,13 @@ import (
88
"errors"
99
"fmt"
1010
"net"
11+
"strings"
1112
"syscall"
1213

14+
"github.com/sirupsen/logrus"
15+
1316
"github.com/Microsoft/hcsshim/internal/log"
17+
"github.com/Microsoft/hcsshim/internal/logfields"
1418
)
1519

1620
var (
@@ -99,48 +103,85 @@ type ErrorEvent struct {
99103
EventID uint16 `json:"EventId,omitempty"`
100104
Flags uint32 `json:"Flags,omitempty"`
101105
Source string `json:"Source,omitempty"`
102-
//Data []EventData `json:"Data,omitempty"` // Omit this as HCS doesn't encode this well. It's more confusing to include. It is however logged in debug mode (see processHcsResult function)
103-
}
104106

105-
type hcsResult struct {
106-
Error int32
107-
ErrorMessage string
108-
ErrorEvents []ErrorEvent `json:"ErrorEvents,omitempty"`
107+
// Omit this as HCS doesn't encode this well. It's more confusing to include.
108+
// It is however logged in during [processHcsResult] errors.
109+
//Data []EventData `json:"Data,omitempty"`
109110
}
110111

111112
func (ev *ErrorEvent) String() string {
112-
evs := "[Event Detail: " + ev.Message
113+
sb := strings.Builder{}
114+
ev.writeTo(&sb)
115+
return sb.String()
116+
}
117+
118+
func (ev *ErrorEvent) writeTo(b *strings.Builder) {
119+
// rough wag at needed length
120+
b.Grow(64 + len(ev.Message) + len(ev.StackTrace) + len(ev.Provider) + len(ev.Source))
121+
122+
// [strings.Builder] Write* functions always return nil errors
123+
_, _ = b.WriteString("[Event Detail: " + ev.Message)
113124
if ev.StackTrace != "" {
114-
evs += " Stack Trace: " + ev.StackTrace
125+
_, _ = b.WriteString(" Stack Trace: " + ev.StackTrace)
115126
}
116127
if ev.Provider != "" {
117-
evs += " Provider: " + ev.Provider
128+
_, _ = b.WriteString(" Provider: " + ev.Provider)
118129
}
119130
if ev.EventID != 0 {
120-
evs = fmt.Sprintf("%s EventID: %d", evs, ev.EventID)
131+
fmt.Fprintf(b, " EventID: %d", ev.EventID)
121132
}
122133
if ev.Flags != 0 {
123-
evs = fmt.Sprintf("%s flags: %d", evs, ev.Flags)
134+
fmt.Fprintf(b, " flags: %d", ev.Flags)
124135
}
125136
if ev.Source != "" {
126-
evs += " Source: " + ev.Source
137+
_, _ = b.WriteString(" Source: " + ev.Source)
127138
}
128-
evs += "]"
129-
return evs
139+
_ = b.WriteByte(']')
140+
}
141+
142+
type hcsResult struct {
143+
Error int32
144+
ErrorMessage string
145+
ErrorEvents []ErrorEvent `json:"ErrorEvents,omitempty"`
146+
// TODO: AttributionRecords
130147
}
131148

132149
func processHcsResult(ctx context.Context, resultJSON string) []ErrorEvent {
133150
if resultJSON != "" {
134151
result := &hcsResult{}
135152
if err := json.Unmarshal([]byte(resultJSON), result); err != nil {
136-
log.G(ctx).WithError(err).Warning("Could not unmarshal HCS result")
153+
log.G(ctx).WithFields(logrus.Fields{
154+
logfields.JSON: resultJSON,
155+
logrus.ErrorKey: err,
156+
}).Warn("Could not unmarshal HCS result")
137157
return nil
138158
}
139159
return result.ErrorEvents
140160
}
141161
return nil
142162
}
143163

164+
// resultError allows passing the events along with the error from [notificationWatcher]
165+
// so they can both be including in the resulting [makeSystemError]/[makeProcessError] call.
166+
//
167+
// errorMessage **should** be the same as err.Error(), so it can safely be ignored.
168+
type resultError struct {
169+
Err error
170+
Events []ErrorEvent
171+
}
172+
173+
func (e *resultError) Error() string {
174+
return appendErrorEvents(e.Err.Error(), e.Events)
175+
}
176+
177+
func (e *resultError) Is(target error) bool {
178+
return errors.Is(e.Err, target)
179+
}
180+
181+
func (e *resultError) Unwrap() error {
182+
return e.Err
183+
}
184+
144185
type HcsError struct {
145186
Op string
146187
Err error
@@ -150,11 +191,7 @@ type HcsError struct {
150191
var _ net.Error = &HcsError{}
151192

152193
func (e *HcsError) Error() string {
153-
s := e.Op + ": " + e.Err.Error()
154-
for _, ev := range e.Events {
155-
s += "\n" + ev.String()
156-
}
157-
return s
194+
return appendErrorEvents(e.Op+": "+e.Err.Error(), e.Events)
158195
}
159196

160197
func (e *HcsError) Is(target error) bool {
@@ -194,11 +231,7 @@ type SystemError struct {
194231
var _ net.Error = &SystemError{}
195232

196233
func (e *SystemError) Error() string {
197-
s := e.Op + " " + e.ID + ": " + e.Err.Error()
198-
for _, ev := range e.Events {
199-
s += "\n" + ev.String()
200-
}
201-
return s
234+
return appendErrorEvents(e.Op+" "+e.ID+": "+e.Err.Error(), e.Events)
202235
}
203236

204237
func makeSystemError(system *System, op string, err error, events []ErrorEvent) error {
@@ -208,6 +241,7 @@ func makeSystemError(system *System, op string, err error, events []ErrorEvent)
208241
return err
209242
}
210243

244+
events, err = getEvents(events, err)
211245
return &SystemError{
212246
ID: system.ID(),
213247
HcsError: HcsError{
@@ -228,11 +262,7 @@ type ProcessError struct {
228262
var _ net.Error = &ProcessError{}
229263

230264
func (e *ProcessError) Error() string {
231-
s := fmt.Sprintf("%s %s:%d: %s", e.Op, e.SystemID, e.Pid, e.Err.Error())
232-
for _, ev := range e.Events {
233-
s += "\n" + ev.String()
234-
}
235-
return s
265+
return appendErrorEvents(fmt.Sprintf("%s %s:%d: %s", e.Op, e.SystemID, e.Pid, e.Err.Error()), e.Events)
236266
}
237267

238268
func makeProcessError(process *Process, op string, err error, events []ErrorEvent) error {
@@ -241,6 +271,8 @@ func makeProcessError(process *Process, op string, err error, events []ErrorEven
241271
if errors.As(err, &e) {
242272
return err
243273
}
274+
275+
events, err = getEvents(events, err)
244276
return &ProcessError{
245277
Pid: process.Pid(),
246278
SystemID: process.SystemID(),
@@ -252,6 +284,34 @@ func makeProcessError(process *Process, op string, err error, events []ErrorEven
252284
}
253285
}
254286

287+
// getEvents checks to see if err is a [resultError] that has events associated with it, and if so,
288+
// returns the unwrapped error and events.
289+
// This is used to flatten an [SystemError] or [ProcessError] that wraps a [resultError]
290+
func getEvents(events []ErrorEvent, err error) ([]ErrorEvent, error) {
291+
// only return nested events if the original events is empty.
292+
// don't use [errors.As], since that will unwrap the error chain and drop the parents.
293+
if resErr, ok := err.(*resultError); err != nil && ok && len(events) == 0 { //nolint:errorlint
294+
return resErr.Events, resErr.Err
295+
}
296+
return events, err
297+
}
298+
299+
// common formatting for error strings followed by event data,
300+
func appendErrorEvents(s string, events []ErrorEvent) string {
301+
if len(events) == 0 {
302+
return s
303+
}
304+
305+
sb := &strings.Builder{}
306+
_, _ = sb.WriteString(s)
307+
for _, ev := range events {
308+
// don't join with newlines since those are ... awkward within error strings
309+
_, _ = sb.WriteString(": ")
310+
ev.writeTo(sb)
311+
}
312+
return sb.String()
313+
}
314+
255315
// IsNotExist checks if an error is caused by the Container or Process not existing.
256316
// Note: Currently, ErrElementNotFound can mean that a Process has either
257317
// already exited, or does not exist. Both IsAlreadyStopped and IsNotExist

0 commit comments

Comments
 (0)