Skip to content

Commit ded6519

Browse files
committed
Revert "Revert "Use healthcheck cron job to determine cluster readiness""
This reverts commit 85fc03b.
1 parent ec3200a commit ded6519

File tree

4 files changed

+153
-39
lines changed

4 files changed

+153
-39
lines changed

pkg/common/cluster/clusterutil.go

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -214,41 +214,50 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l
214214
return nil
215215
}
216216

217-
// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
217+
// ClusterConfig returns the rest API config for a given cluster as well as the provider it
218+
// used to discover the config.
218219
// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
219220
// assume we are running in a cluster and use in-cluster REST config instead.
220-
func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
221-
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
222-
223-
logger.Print("Polling Cluster Health...\n")
224-
225-
var restConfig *rest.Config
226-
var providerType string
227-
221+
func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) {
228222
if clusterID == "" {
229223
if restConfig, err = rest.InClusterConfig(); err != nil {
230-
logger.Printf("Error getting in-cluster REST config: %v\n", err)
231-
return false, nil, nil
224+
return nil, "", fmt.Errorf("error getting in-cluster rest config: %w", err)
232225
}
233226

234227
// FIXME: Is there a way to discover this from within the cluster?
235228
// For now, ocm and rosa behave the same, so hardcode either.
236229
providerType = "ocm"
230+
return
237231

238-
} else {
239-
provider, err := providers.ClusterProvider()
232+
}
233+
provider, err := providers.ClusterProvider()
240234

241-
if err != nil {
242-
return false, nil, fmt.Errorf("error getting cluster provisioning client: %v", err)
243-
}
235+
if err != nil {
236+
return nil, "", fmt.Errorf("error getting cluster provisioning client: %w", err)
237+
}
238+
providerType = provider.Type()
244239

245-
restConfig, err = getRestConfig(provider, clusterID)
246-
if err != nil {
247-
logger.Printf("Error generating Rest Config: %v\n", err)
248-
return false, nil, nil
249-
}
240+
restConfig, err = getRestConfig(provider, clusterID)
241+
if err != nil {
242+
243+
return nil, "", fmt.Errorf("error generating rest config: %w", err)
244+
}
245+
246+
return
247+
}
250248

251-
providerType = provider.Type()
249+
// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
250+
// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
251+
// assume we are running in a cluster and use in-cluster REST config instead.
252+
func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
253+
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
254+
255+
logger.Print("Polling Cluster Health...\n")
256+
257+
restConfig, providerType, err := ClusterConfig(clusterID)
258+
if err != nil {
259+
logger.Printf("Error getting cluster config: %v\n", err)
260+
return false, nil, nil
252261
}
253262

254263
kubeClient, err := kubernetes.NewForConfig(restConfig)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package healthchecks
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"log"
7+
8+
"github.com/openshift/osde2e/pkg/common/logging"
9+
batchv1 "k8s.io/api/batch/v1"
10+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11+
"k8s.io/apimachinery/pkg/watch"
12+
"k8s.io/client-go/kubernetes"
13+
)
14+
15+
// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. If the cluster
16+
// is not ready, it will return an error.
17+
func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) error {
18+
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
19+
20+
logger.Print("Checking whether cluster is healthy before proceeding...")
21+
22+
bv1C := k8sClient.BatchV1()
23+
namespace := "openshift-monitoring"
24+
name := "osd-cluster-ready"
25+
jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})
26+
if err != nil {
27+
return fmt.Errorf("failed listing jobs: %w", err)
28+
}
29+
for _, job := range jobs.Items {
30+
if job.Name != name {
31+
continue
32+
}
33+
if job.Status.Succeeded > 0 {
34+
log.Println("Healthcheck job has already succeeded")
35+
return nil
36+
}
37+
log.Println("Healthcheck job has not yet succeeded, watching...")
38+
}
39+
watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{
40+
ResourceVersion: jobs.ResourceVersion,
41+
FieldSelector: "metadata.name=osd-cluster-ready",
42+
})
43+
if err != nil {
44+
return fmt.Errorf("failed watching job: %w", err)
45+
}
46+
for {
47+
select {
48+
case event := <-watcher.ResultChan():
49+
switch event.Type {
50+
case watch.Added:
51+
fallthrough
52+
case watch.Modified:
53+
job := event.Object.(*batchv1.Job)
54+
if job.Status.Succeeded > 0 {
55+
return nil
56+
}
57+
if job.Status.Failed > 0 {
58+
return fmt.Errorf("cluster readiness job failed")
59+
}
60+
case watch.Deleted:
61+
return fmt.Errorf("cluster readiness job deleted before becoming ready (this should never happen)")
62+
case watch.Error:
63+
return fmt.Errorf("watch returned error event: %v", event)
64+
default:
65+
logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
66+
}
67+
case <-ctx.Done():
68+
return fmt.Errorf("healthcheck watch context cancelled while still waiting for success")
69+
}
70+
}
71+
}

pkg/common/config/config.go

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,12 @@ var Tests = struct {
175175
// Env: SKIP_CLUSTER_HEALTH_CHECKS
176176
SkipClusterHealthChecks string
177177

178+
// ClusterHealthChecksTimeout defines the duration for which the harness will
179+
// wait for the cluster to indicate it is healthy before cancelling the test
180+
// run. This value should be formatted for use with time.ParseDuration.
181+
// Env: CLUSTER_HEALTH_CHECKS_TIMEOUT
182+
ClusterHealthChecksTimeout string
183+
178184
// MetricsBucket is the bucket that metrics data will be uploaded to.
179185
// Env: METRICS_BUCKET
180186
MetricsBucket string
@@ -184,16 +190,17 @@ var Tests = struct {
184190
ServiceAccount string
185191
}{
186192

187-
PollingTimeout: "tests.pollingTimeout",
188-
GinkgoSkip: "tests.ginkgoSkip",
189-
GinkgoFocus: "tests.focus",
190-
TestsToRun: "tests.testsToRun",
191-
SuppressSkipNotifications: "tests.suppressSkipNotifications",
192-
CleanRuns: "tests.cleanRuns",
193-
OperatorSkip: "tests.operatorSkip",
194-
SkipClusterHealthChecks: "tests.skipClusterHealthChecks",
195-
MetricsBucket: "tests.metricsBucket",
196-
ServiceAccount: "tests.serviceAccount",
193+
PollingTimeout: "tests.pollingTimeout",
194+
GinkgoSkip: "tests.ginkgoSkip",
195+
GinkgoFocus: "tests.focus",
196+
TestsToRun: "tests.testsToRun",
197+
SuppressSkipNotifications: "tests.suppressSkipNotifications",
198+
CleanRuns: "tests.cleanRuns",
199+
OperatorSkip: "tests.operatorSkip",
200+
SkipClusterHealthChecks: "tests.skipClusterHealthChecks",
201+
MetricsBucket: "tests.metricsBucket",
202+
ServiceAccount: "tests.serviceAccount",
203+
ClusterHealthChecksTimeout: "tests.clusterHealthChecksTimeout",
197204
}
198205

199206
// Cluster config keys.
@@ -537,6 +544,9 @@ func init() {
537544
viper.SetDefault(Tests.SkipClusterHealthChecks, false)
538545
viper.BindEnv(Tests.OperatorSkip, "SKIP_CLUSTER_HEALTH_CHECKS")
539546

547+
viper.SetDefault(Tests.ClusterHealthChecksTimeout, "2h")
548+
viper.BindEnv(Tests.ClusterHealthChecksTimeout, "CLUSTER_HEALTH_CHECKS_TIMEOUT")
549+
540550
viper.SetDefault(Tests.MetricsBucket, "osde2e-metrics")
541551
viper.BindEnv(Tests.MetricsBucket, "METRICS_BUCKET")
542552

pkg/e2e/e2e.go

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package e2e
44
import (
55
"bytes"
66
"compress/gzip"
7+
"context"
78
"encoding/json"
89
"encoding/xml"
910
"fmt"
@@ -19,6 +20,7 @@ import (
1920
"github.com/hpcloud/tail"
2021
junit "github.com/joshdk/go-junit"
2122
vegeta "github.com/tsenart/vegeta/lib"
23+
"k8s.io/client-go/kubernetes"
2224

2325
pd "github.com/PagerDuty/go-pagerduty"
2426
"github.com/onsi/ginkgo"
@@ -31,6 +33,7 @@ import (
3133
"github.com/openshift/osde2e/pkg/common/aws"
3234
"github.com/openshift/osde2e/pkg/common/cluster"
3335
clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
36+
"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
3437
"github.com/openshift/osde2e/pkg/common/clusterproperties"
3538
"github.com/openshift/osde2e/pkg/common/config"
3639
"github.com/openshift/osde2e/pkg/common/events"
@@ -121,14 +124,35 @@ func beforeSuite() bool {
121124
log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
122125
}
123126

124-
err = clusterutil.WaitForClusterReady(cluster.ID(), nil)
125-
events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
126-
if err != nil {
127-
log.Printf("Cluster failed health check: %v", err)
128-
getLogs()
129-
return false
127+
if viper.GetString(config.Tests.SkipClusterHealthChecks) != "true" {
128+
clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
129+
if err != nil {
130+
log.Printf("Failed looking up cluster config for healthcheck: %v", err)
131+
return false
132+
}
133+
kubeClient, err := kubernetes.NewForConfig(clusterConfig)
134+
if err != nil {
135+
log.Printf("Error generating Kube Clientset: %v\n", err)
136+
return false
137+
}
138+
duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
139+
if err != nil {
140+
log.Printf("Failed parsing health check timeout: %v", err)
141+
return false
142+
}
143+
ctx, cancel := context.WithTimeout(context.Background(), duration)
144+
defer cancel()
145+
err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
146+
events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
147+
if err != nil {
148+
log.Printf("Cluster failed health check: %v", err)
149+
getLogs()
150+
return false
151+
}
152+
log.Println("Cluster is healthy and ready for testing")
153+
} else {
154+
log.Println("Skipping health checks as requested")
130155
}
131-
132156
if len(viper.GetString(config.Addons.IDs)) > 0 {
133157
if viper.GetString(config.Provider) != "mock" {
134158
err = installAddons()

0 commit comments

Comments
 (0)