Skip to content

Commit a26d772

Browse files
Merge pull request openshift#209 from jeefy/138
SDCICD-138: Add additional pod health checks to runner jobs
2 parents 4106beb + a9d4356 commit a26d772

File tree

2 files changed

+21
-7
lines changed

2 files changed

+21
-7
lines changed

pkg/runner/pod.go

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,9 @@ import (
1818
const (
1919
configMapCreateTimeout = 30 * time.Second
2020
podCreateTimeout = 90 * time.Second
21+
podPendingTimeout = 20 // 20 Iterations
22+
fastPoll = 5 * time.Second
23+
slowPoll = 15 * time.Second
2124

2225
resultsPort = 8000
2326
resultsPortName = "results"
@@ -111,7 +114,7 @@ func (r *Runner) createPod() (pod *kubev1.Pod, err error) {
111114
return nil, fmt.Errorf("error creating ConfigMap: %v", err)
112115
}
113116

114-
err = wait.PollImmediate(5*time.Second, configMapCreateTimeout, func() (done bool, err error) {
117+
err = wait.PollImmediate(fastPoll, configMapCreateTimeout, func() (done bool, err error) {
115118
if configMap, err = r.Kube.CoreV1().ConfigMaps(r.Namespace).Get(configMap.Name, metav1.GetOptions{}); err != nil {
116119
log.Printf("Error creating %s config map: %v", configMap.Name, err)
117120
}
@@ -144,7 +147,7 @@ func (r *Runner) createPod() (pod *kubev1.Pod, err error) {
144147

145148
// retry until Pod can be created or timeout occurs
146149
var createdPod *kubev1.Pod
147-
err = wait.PollImmediate(5*time.Second, podCreateTimeout, func() (done bool, err error) {
150+
err = wait.PollImmediate(fastPoll, podCreateTimeout, func() (done bool, err error) {
148151
if createdPod, err = r.Kube.CoreV1().Pods(r.Namespace).Create(pod); err != nil {
149152
log.Printf("Error creating %s runner Pod: %v", r.Name, err)
150153
}
@@ -154,17 +157,22 @@ func (r *Runner) createPod() (pod *kubev1.Pod, err error) {
154157
}
155158

156159
func (r *Runner) waitForPodRunning(pod *kubev1.Pod) error {
157-
return wait.PollImmediate(10*time.Second, 3*time.Minute, func() (done bool, err error) {
160+
var pendingCount int = 0
161+
return wait.PollImmediate(fastPoll, 3*time.Minute, func() (done bool, err error) {
158162
pod, err = r.Kube.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{})
159163
if err != nil && !kerror.IsNotFound(err) {
160164
return
161165
} else if pod == nil {
162166
err = errors.New("pod can't be nil")
163-
} else if pod.Status.Phase == kubev1.PodFailed {
164-
err = errors.New("failed waiting for Pod: the Pod has failed")
167+
} else if pod.Status.Phase == kubev1.PodFailed || pod.Status.Phase == kubev1.PodUnknown {
168+
err = fmt.Errorf("failed waiting for Pod: the Pod has a phase of %s", pod.Status.Phase)
165169
} else if pod.Status.Phase == kubev1.PodRunning {
166170
done = true
167171
} else {
172+
pendingCount++
173+
if pendingCount > podPendingTimeout {
174+
err = errors.New("timed out waiting for pod to start")
175+
}
168176
r.Printf("Waiting for Pod '%s/%s' to start Running...", pod.Namespace, pod.Name)
169177
}
170178
return

pkg/runner/service.go

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,8 @@ func (r *Runner) createService(pod *kubev1.Pod) (svc *kubev1.Service, err error)
3333

3434
func (r *Runner) waitForCompletion(timeoutInSeconds int) error {
3535
var endpoints *kubev1.Endpoints
36-
return wait.PollImmediate(15*time.Second, time.Duration(timeoutInSeconds)*time.Second, func() (done bool, err error) {
36+
var pendingCount int = 0
37+
return wait.PollImmediate(slowPoll, time.Duration(timeoutInSeconds)*time.Second, func() (done bool, err error) {
3738
endpoints, err = r.Kube.CoreV1().Endpoints(r.svc.Namespace).Get(r.svc.Name, metav1.GetOptions{})
3839
if err != nil && !kerror.IsNotFound(err) {
3940
r.Printf("Encountered error getting endpoint '%s/%s': %v", r.svc.Namespace, r.svc.Name, err)
@@ -50,11 +51,16 @@ func (r *Runner) waitForCompletion(timeoutInSeconds int) error {
5051
return false, err
5152
}
5253
for _, pod := range pods.Items {
53-
if pod.Status.Phase == kubev1.PodFailed {
54+
if pod.Status.Phase == kubev1.PodFailed || pod.Status.Phase == kubev1.PodUnknown {
5455
r.Printf("Pod entered error state while waiting for endpoint: %+v", pod.Status)
5556
return false, fmt.Errorf("pod failed while waiting for endpoints")
5657
} else if pod.Status.Phase == kubev1.PodSucceeded {
5758
return true, nil
59+
} else if pod.Status.Phase == kubev1.PodPending {
60+
pendingCount++
61+
if pendingCount > podPendingTimeout {
62+
return false, fmt.Errorf("timed out waiting for pod to start")
63+
}
5864
}
5965
}
6066
r.Printf("Waiting for test results using Endpoint '%s/%s'...", endpoints.Namespace, endpoints.Name)

0 commit comments

Comments
 (0)