Skip to content

Commit bfe78cc

Browse files
authored
Make restart pod more flexible to different failure scenarios (#4340)
1 parent 3fd1048 commit bfe78cc

File tree

2 files changed

+260
-24
lines changed

2 files changed

+260
-24
lines changed

controllers/actions.github.com/ephemeralrunner_controller.go

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -313,37 +313,38 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
313313
cs := runnerContainerStatus(pod)
314314
switch {
315315
case pod.Status.Phase == corev1.PodFailed: // All containers are stopped
316-
switch {
317-
case pod.Status.Reason == "Evicted":
318-
log.Info("Pod evicted; Deleting ephemeral runner or pod",
319-
"podPhase", pod.Status.Phase,
320-
"podReason", pod.Status.Reason,
321-
"podMessage", pod.Status.Message,
322-
)
323-
324-
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
325-
326-
case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue.
327-
log.Info("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod",
328-
"podPhase", pod.Status.Phase,
329-
"podReason", pod.Status.Reason,
330-
"podMessage", pod.Status.Message,
331-
)
316+
log.Info("Pod is in failed phase, inspecting runner container status",
317+
"podReason", pod.Status.Reason,
318+
"podMessage", pod.Status.Message,
319+
"podConditions", pod.Status.Conditions,
320+
)
321+
// If the runner pod did not have chance to start, terminated state may not be set.
322+
// Therefore, we should try to restart it.
323+
if cs == nil || cs.State.Terminated == nil {
324+
log.Info("Runner container does not have state set, deleting pod as failed so it can be restarted")
332325
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
326+
}
333327

334-
default:
335-
log.Info("Pod is in failed phase; updating ephemeral runner status",
336-
"podPhase", pod.Status.Phase,
337-
"podReason", pod.Status.Reason,
338-
"podMessage", pod.Status.Message,
339-
)
340-
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
341-
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
328+
if cs.State.Terminated.ExitCode == 0 {
329+
log.Info("Runner container has succeeded but pod is in failed phase; Assume successful exit")
330+
// If the pod is in a failed state, that means that at least one container exited with non-zero exit code.
331+
// If the runner container exits with 0, we assume that the runner has finished successfully.
332+
// If side-car container exits with non-zero, it shouldn't affect the runner. Runner exit code
333+
// drives the controller's inference of whether the job has succeeded or failed.
334+
if err := r.Delete(ctx, ephemeralRunner); err != nil {
335+
log.Error(err, "Failed to delete ephemeral runner after successful completion")
342336
return ctrl.Result{}, err
343337
}
344338
return ctrl.Result{}, nil
345339
}
346340

341+
log.Error(
342+
errors.New("ephemeral runner container has failed, with runner container exit code non-zero"),
343+
"Ephemeral runner container has failed, and runner container termination exit code is non-zero",
344+
"containerTerminatedState", cs.State.Terminated,
345+
)
346+
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
347+
347348
case cs == nil:
348349
// starting, no container state yet
349350
log.Info("Waiting for runner container status to be available")
@@ -397,6 +398,7 @@ func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Conte
397398
log.Info("Removed the runner from the service")
398399
return nil
399400
}
401+
400402
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
401403
log.Error(err, "Failed to delete runner pod on failure")
402404
return err
@@ -570,6 +572,8 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR
570572

571573
// deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
572574
// It should not be responsible for setting the status to Failed.
575+
//
576+
// It should be called by deleteEphemeralRunnerOrPod which is responsible for deciding whether to delete the EphemeralRunner or just the Pod.
573577
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
574578
if pod.DeletionTimestamp.IsZero() {
575579
log.Info("Deleting the ephemeral runner pod", "podId", pod.UID)

controllers/actions.github.com/ephemeralrunner_controller_test.go

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,238 @@ var _ = Describe("EphemeralRunner", func() {
261261
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
262262
})
263263

264+
It("It should delete ephemeral runner when pod failed before runner state is recorded and job assigned", func() {
265+
er := new(v1alpha1.EphemeralRunner)
266+
Eventually(func() error {
267+
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
268+
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
269+
270+
er.Status.JobID = "1"
271+
err := k8sClient.Status().Update(ctx, er)
272+
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
273+
274+
Eventually(func() (string, error) {
275+
current := new(v1alpha1.EphemeralRunner)
276+
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
277+
return "", err
278+
}
279+
return current.Status.JobID, nil
280+
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("1"))
281+
282+
pod := new(corev1.Pod)
283+
Eventually(func() (bool, error) {
284+
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
285+
return false, err
286+
}
287+
return true, nil
288+
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))
289+
290+
pod.Status.Phase = corev1.PodFailed
291+
pod.Status.ContainerStatuses = nil
292+
err = k8sClient.Status().Update(ctx, pod)
293+
Expect(err).To(BeNil(), "Failed to update pod status")
294+
295+
Eventually(func() bool {
296+
check := new(v1alpha1.EphemeralRunner)
297+
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
298+
return kerrors.IsNotFound(err)
299+
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
300+
})
301+
302+
It("It should delete ephemeral runner when pod failed before runner state is recorded and job not assigned", func() {
303+
pod := new(corev1.Pod)
304+
Eventually(func() (bool, error) {
305+
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
306+
return false, err
307+
}
308+
return true, nil
309+
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))
310+
311+
oldPodUID := pod.UID
312+
313+
pod.Status.Phase = corev1.PodFailed
314+
pod.Status.ContainerStatuses = nil
315+
err := k8sClient.Status().Update(ctx, pod)
316+
Expect(err).To(BeNil(), "Failed to update pod status")
317+
318+
Eventually(
319+
func() (int, error) {
320+
updated := new(v1alpha1.EphemeralRunner)
321+
err := k8sClient.Get(
322+
ctx,
323+
client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
324+
updated,
325+
)
326+
if err != nil {
327+
return 0, err
328+
}
329+
return len(updated.Status.Failures), nil
330+
},
331+
ephemeralRunnerTimeout,
332+
ephemeralRunnerInterval,
333+
).Should(BeEquivalentTo(1))
334+
335+
Eventually(
336+
func() (bool, error) {
337+
newPod := new(corev1.Pod)
338+
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod)
339+
if err != nil {
340+
return false, err
341+
}
342+
return newPod.UID != oldPodUID, nil
343+
},
344+
ephemeralRunnerTimeout,
345+
ephemeralRunnerInterval,
346+
).Should(BeTrue(), "Pod should be re-created")
347+
})
348+
349+
It("It should treat pod failed with runner container exit 0 as success with job id", func() {
350+
er := new(v1alpha1.EphemeralRunner)
351+
Eventually(func() error {
352+
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
353+
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
354+
355+
er.Status.JobID = "1"
356+
err := k8sClient.Status().Update(ctx, er)
357+
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
358+
359+
pod := new(corev1.Pod)
360+
Eventually(
361+
func() error {
362+
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
363+
return err
364+
}
365+
return nil
366+
},
367+
ephemeralRunnerTimeout,
368+
ephemeralRunnerInterval,
369+
).Should(Succeed(), "failed to get pod")
370+
371+
pod.Status.Phase = corev1.PodFailed
372+
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
373+
Name: v1alpha1.EphemeralRunnerContainerName,
374+
State: corev1.ContainerState{
375+
Terminated: &corev1.ContainerStateTerminated{
376+
ExitCode: 0,
377+
},
378+
},
379+
})
380+
err = k8sClient.Status().Update(ctx, pod)
381+
Expect(err).To(BeNil(), "Failed to update pod status")
382+
383+
Eventually(
384+
func() bool {
385+
check := new(v1alpha1.EphemeralRunner)
386+
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
387+
return kerrors.IsNotFound(err)
388+
},
389+
ephemeralRunnerTimeout,
390+
ephemeralRunnerInterval,
391+
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
392+
})
393+
394+
It("It should treat pod failed with runner container exit 0 as success with no job id", func() {
395+
pod := new(corev1.Pod)
396+
Eventually(
397+
func() error {
398+
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
399+
return err
400+
}
401+
return nil
402+
},
403+
ephemeralRunnerTimeout,
404+
ephemeralRunnerInterval,
405+
).Should(Succeed(), "failed to get pod")
406+
407+
pod.Status.Phase = corev1.PodFailed
408+
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
409+
Name: v1alpha1.EphemeralRunnerContainerName,
410+
State: corev1.ContainerState{
411+
Terminated: &corev1.ContainerStateTerminated{
412+
ExitCode: 0,
413+
},
414+
},
415+
})
416+
err := k8sClient.Status().Update(ctx, pod)
417+
Expect(err).To(BeNil(), "Failed to update pod status")
418+
419+
Eventually(
420+
func() bool {
421+
check := new(v1alpha1.EphemeralRunner)
422+
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
423+
return kerrors.IsNotFound(err)
424+
},
425+
ephemeralRunnerTimeout,
426+
ephemeralRunnerInterval,
427+
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
428+
})
429+
430+
It("It should mark as failed when job is not assigned and pod is failed", func() {
431+
er := new(v1alpha1.EphemeralRunner)
432+
Eventually(func() error {
433+
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
434+
},
435+
ephemeralRunnerTimeout,
436+
ephemeralRunnerInterval,
437+
).Should(Succeed(), "failed to get ephemeral runner")
438+
439+
pod := new(corev1.Pod)
440+
Eventually(
441+
func() error {
442+
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
443+
return err
444+
}
445+
return nil
446+
},
447+
ephemeralRunnerTimeout,
448+
ephemeralRunnerInterval,
449+
).Should(Succeed(), "failed to get pod")
450+
451+
pod.Status.Phase = corev1.PodFailed
452+
oldPodUID := pod.UID
453+
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
454+
Name: v1alpha1.EphemeralRunnerContainerName,
455+
State: corev1.ContainerState{
456+
Terminated: &corev1.ContainerStateTerminated{
457+
ExitCode: 1,
458+
},
459+
},
460+
})
461+
462+
err := k8sClient.Status().Update(ctx, pod)
463+
Expect(err).To(BeNil(), "Failed to update pod status")
464+
465+
Eventually(
466+
func() (int, error) {
467+
updated := new(v1alpha1.EphemeralRunner)
468+
err := k8sClient.Get(
469+
ctx,
470+
client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
471+
updated,
472+
)
473+
if err != nil {
474+
return 0, err
475+
}
476+
return len(updated.Status.Failures), nil
477+
},
478+
ephemeralRunnerTimeout,
479+
ephemeralRunnerInterval,
480+
).Should(BeEquivalentTo(1))
481+
482+
Eventually(
483+
func() (bool, error) {
484+
newPod := new(corev1.Pod)
485+
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod)
486+
if err != nil {
487+
return false, err
488+
}
489+
return newPod.UID != oldPodUID, nil
490+
},
491+
ephemeralRunnerTimeout,
492+
ephemeralRunnerInterval,
493+
).Should(BeTrue(), "Pod should be re-created")
494+
})
495+
264496
It("It should failed if a pod template is invalid", func() {
265497
invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name)
266498
invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"

0 commit comments

Comments
 (0)