Skip to content

Commit 5788f1c

Browse files
Merge pull request #528 from Tof1973/OSD-20998
OSD-20998 make infra resize more resilient to temporary error
2 parents 600b2a6 + 01e51b8 commit 5788f1c

File tree

1 file changed

+42
-8
lines changed

1 file changed

+42
-8
lines changed

cmd/cluster/resize/infra_node.go

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,8 @@ func (r *Resize) RunInfra(ctx context.Context) error {
122122
nodes := &corev1.NodeList{}
123123

124124
if err := r.client.List(ctx, nodes, &client.ListOptions{LabelSelector: selector}); err != nil {
125-
return false, err
125+
log.Printf("error retrieving nodes list, continuing to wait: %s", err)
126+
return false, nil
126127
}
127128

128129
readyNodes := 0
@@ -193,7 +194,8 @@ func (r *Resize) RunInfra(ctx context.Context) error {
193194
if apierrors.IsNotFound(err) {
194195
return true, nil
195196
}
196-
return false, err
197+
log.Printf("error retrieving machines list, continuing to wait: %s", err)
198+
return false, nil
197199
}
198200

199201
log.Printf("original machinepool %s/%s still exists, continuing to wait", originalMp.Namespace, originalMp.Name)
@@ -205,7 +207,7 @@ func (r *Resize) RunInfra(ctx context.Context) error {
205207
// Wait for original nodes to delete
206208
if err := wait.PollImmediate(twentySecondIncrement, twentyMinuteTimeout, func() (bool, error) {
207209
// Re-check for originalNodes to see if they have been deleted
208-
return r.nodesMatchExpectedCount(ctx, originalNodeSelector, 0)
210+
return skipError(wrapResult(r.nodesMatchExpectedCount(ctx, originalNodeSelector, 0)), "error matching expected count")
209211
}); err != nil {
210212
switch {
211213
case errors.Is(err, wait.ErrWaitTimeout):
@@ -219,7 +221,7 @@ func (r *Resize) RunInfra(ctx context.Context) error {
219221

220222
if err := wait.PollImmediate(twentySecondIncrement, twentyMinuteTimeout, func() (bool, error) {
221223
log.Printf("waiting for nodes to terminate")
222-
return r.nodesMatchExpectedCount(ctx, originalNodeSelector, 0)
224+
return skipError(wrapResult(r.nodesMatchExpectedCount(ctx, originalNodeSelector, 0)), "error matching expected count")
223225
}); err != nil {
224226
if errors.Is(err, wait.ErrWaitTimeout) {
225227
log.Printf("timed out waiting for nodes to terminate: %v.", err.Error())
@@ -242,11 +244,13 @@ func (r *Resize) RunInfra(ctx context.Context) error {
242244
nodes := &corev1.NodeList{}
243245
selector, err := labels.Parse("node-role.kubernetes.io/infra=")
244246
if err != nil {
247+
// This should never happen, so we do not have to skip this error
245248
return false, err
246249
}
247250

248251
if err := r.client.List(ctx, nodes, &client.ListOptions{LabelSelector: selector}); err != nil {
249-
return false, err
252+
log.Printf("error retrieving nodes list, continuing to wait: %s", err)
253+
return false, nil
250254
}
251255

252256
readyNodes := 0
@@ -292,7 +296,8 @@ func (r *Resize) RunInfra(ctx context.Context) error {
292296
if apierrors.IsNotFound(err) {
293297
return true, nil
294298
}
295-
return false, err
299+
log.Printf("error retrieving old machine details, continuing to wait: %s", err)
300+
return false, nil
296301
}
297302

298303
log.Printf("temporary machinepool %s/%s still exists, continuing to wait", tempMp.Namespace, tempMp.Name)
@@ -307,11 +312,13 @@ func (r *Resize) RunInfra(ctx context.Context) error {
307312
nodes := &corev1.NodeList{}
308313
selector, err := labels.Parse("node-role.kubernetes.io/infra=")
309314
if err != nil {
315+
// This should never happen, so we do not have to skip this error
310316
return false, err
311317
}
312318

313319
if err := r.client.List(ctx, nodes, &client.ListOptions{LabelSelector: selector}); err != nil {
314-
return false, err
320+
log.Printf("error retrieving nodes list, continuing to wait: %s", err)
321+
return false, nil
315322
}
316323

317324
switch len(nodes.Items) {
@@ -334,7 +341,7 @@ func (r *Resize) RunInfra(ctx context.Context) error {
334341

335342
if err := wait.PollImmediate(twentySecondIncrement, twentyMinuteTimeout, func() (bool, error) {
336343
log.Printf("waiting for nodes to terminate")
337-
return r.nodesMatchExpectedCount(ctx, tempNodeSelector, 0)
344+
return skipError(wrapResult(r.nodesMatchExpectedCount(ctx, tempNodeSelector, 0)), "error matching expected count")
338345
}); err != nil {
339346
if errors.Is(err, wait.ErrWaitTimeout) {
340347
log.Printf("timed out waiting for nodes to terminate: %v.", err.Error())
@@ -536,3 +543,30 @@ func (r *Resize) nodesMatchExpectedCount(ctx context.Context, labelSelector labe
536543

537544
return false, nil
538545
}
546+
547+
// an error raised inside a retry loop should not be treated as a failure: we just log it and keep polling
548+
// when a function returns a bool status and an error, the following helper can be used
549+
// f being a function returning (bool, error), replace
550+
//
551+
// return f(...)
552+
//
553+
// by
554+
//
555+
// return skipError(wrapResult(f(...)), "message to context the error")
556+
//
557+
// and then the return will always have error set to nil, but a continuing message will be displayed in case of error
558+
// result pairs the (bool, error) values returned by a poll condition so that
// both can be passed to skipError as a single argument.
type result struct {
	condition bool
	err       error
}

// wrapResult packs a (condition, err) pair into a result. It exists so that a
// two-value call can be nested directly: skipError(wrapResult(f(...)), "msg").
func wrapResult(condition bool, err error) result {
	return result{condition: condition, err: err}
}

// skipError turns a poll result into one that never aborts the polling loop.
//
// When res.err is nil, res.condition is returned unchanged. When res.err is
// non-nil, the error is logged with msg as context and (false, nil) is
// returned so the caller keeps polling. The returned error is always nil.
func skipError(res result, msg string) (bool, error) {
	if res.err != nil {
		log.Printf("%s, continuing to wait: %s", msg, res.err)
		// A condition value that comes with an error is not trustworthy;
		// report "not done" so the loop really does continue, as logged.
		return false, nil
	}
	return res.condition, nil
}

0 commit comments

Comments
 (0)