Skip to content

Commit ded6519

Browse files
committed
Revert "Revert "Use healthcheck cron job to determine cluster readiness""
This reverts commit 85fc03b.
1 parent ec3200a commit ded6519

File tree

4 files changed

+153
-39
lines changed

4 files changed

+153
-39
lines changed

pkg/common/cluster/clusterutil.go

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -214,41 +214,50 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l
214214
return nil
215215
}
216216

217-
// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
217+
// ClusterConfig returns the rest API config for a given cluster as well as the provider it
218+
// used to discover the config.
218219
// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
219220
// assume we are running in a cluster and use in-cluster REST config instead.
220-
func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
221-
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
222-
223-
logger.Print("Polling Cluster Health...\n")
224-
225-
var restConfig *rest.Config
226-
var providerType string
227-
221+
func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) {
228222
if clusterID == "" {
229223
if restConfig, err = rest.InClusterConfig(); err != nil {
230-
logger.Printf("Error getting in-cluster REST config: %v\n", err)
231-
return false, nil, nil
224+
return nil, "", fmt.Errorf("error getting in-cluster rest config: %w", err)
232225
}
233226

234227
// FIXME: Is there a way to discover this from within the cluster?
235228
// For now, ocm and rosa behave the same, so hardcode either.
236229
providerType = "ocm"
230+
return
237231

238-
} else {
239-
provider, err := providers.ClusterProvider()
232+
}
233+
provider, err := providers.ClusterProvider()
240234

241-
if err != nil {
242-
return false, nil, fmt.Errorf("error getting cluster provisioning client: %v", err)
243-
}
235+
if err != nil {
236+
return nil, "", fmt.Errorf("error getting cluster provisioning client: %w", err)
237+
}
238+
providerType = provider.Type()
244239

245-
restConfig, err = getRestConfig(provider, clusterID)
246-
if err != nil {
247-
logger.Printf("Error generating Rest Config: %v\n", err)
248-
return false, nil, nil
249-
}
240+
restConfig, err = getRestConfig(provider, clusterID)
241+
if err != nil {
242+
243+
return nil, "", fmt.Errorf("error generating rest config: %w", err)
244+
}
245+
246+
return
247+
}
250248

251-
providerType = provider.Type()
249+
// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
250+
// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
251+
// assume we are running in a cluster and use in-cluster REST config instead.
252+
func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
253+
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
254+
255+
logger.Print("Polling Cluster Health...\n")
256+
257+
restConfig, providerType, err := ClusterConfig(clusterID)
258+
if err != nil {
259+
logger.Printf("Error getting cluster config: %v\n", err)
260+
return false, nil, nil
252261
}
253262

254263
kubeClient, err := kubernetes.NewForConfig(restConfig)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package healthchecks
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"log"
7+
8+
"github.com/openshift/osde2e/pkg/common/logging"
9+
batchv1 "k8s.io/api/batch/v1"
10+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11+
"k8s.io/apimachinery/pkg/watch"
12+
"k8s.io/client-go/kubernetes"
13+
)
14+
15+
// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. If the cluster
16+
// is not ready, it will return an error.
17+
func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) error {
18+
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
19+
20+
logger.Print("Checking whether cluster is healthy before proceeding...")
21+
22+
bv1C := k8sClient.BatchV1()
23+
namespace := "openshift-monitoring"
24+
name := "osd-cluster-ready"
25+
jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})
26+
if err != nil {
27+
return fmt.Errorf("failed listing jobs: %w", err)
28+
}
29+
for _, job := range jobs.Items {
30+
if job.Name != name {
31+
continue
32+
}
33+
if job.Status.Succeeded > 0 {
34+
log.Println("Healthcheck job has already succeeded")
35+
return nil
36+
}
37+
log.Println("Healthcheck job has not yet succeeded, watching...")
38+
}
39+
watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{
40+
ResourceVersion: jobs.ResourceVersion,
41+
FieldSelector: "metadata.name=osd-cluster-ready",
42+
})
43+
if err != nil {
44+
return fmt.Errorf("failed watching job: %w", err)
45+
}
46+
for {
47+
select {
48+
case event := <-watcher.ResultChan():
49+
switch event.Type {
50+
case watch.Added:
51+
fallthrough
52+
case watch.Modified:
53+
job := event.Object.(*batchv1.Job)
54+
if job.Status.Succeeded > 0 {
55+
return nil
56+
}
57+
if job.Status.Failed > 0 {
58+
return fmt.Errorf("cluster readiness job failed")
59+
}
60+
case watch.Deleted:
61+
return fmt.Errorf("cluster readiness job deleted before becoming ready (this should never happen)")
62+
case watch.Error:
63+
return fmt.Errorf("watch returned error event: %v", event)
64+
default:
65+
logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
66+
}
67+
case <-ctx.Done():
68+
return fmt.Errorf("healthcheck watch context cancelled while still waiting for success")
69+
}
70+
}
71+
}

pkg/common/config/config.go

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,12 @@ var Tests = struct {
175175
// Env: SKIP_CLUSTER_HEALTH_CHECKS
176176
SkipClusterHealthChecks string
177177

178+
// ClusterHealthChecksTimeout defines the duration for which the harness will
179+
// wait for the cluster to indicate it is healthy before cancelling the test
180+
// run. This value should be formatted for use with time.ParseDuration.
181+
// Env: CLUSTER_HEALTH_CHECKS_TIMEOUT
182+
ClusterHealthChecksTimeout string
183+
178184
// MetricsBucket is the bucket that metrics data will be uploaded to.
179185
// Env: METRICS_BUCKET
180186
MetricsBucket string
@@ -184,16 +190,17 @@ var Tests = struct {
184190
ServiceAccount string
185191
}{
186192

187-
PollingTimeout: "tests.pollingTimeout",
188-
GinkgoSkip: "tests.ginkgoSkip",
189-
GinkgoFocus: "tests.focus",
190-
TestsToRun: "tests.testsToRun",
191-
SuppressSkipNotifications: "tests.suppressSkipNotifications",
192-
CleanRuns: "tests.cleanRuns",
193-
OperatorSkip: "tests.operatorSkip",
194-
SkipClusterHealthChecks: "tests.skipClusterHealthChecks",
195-
MetricsBucket: "tests.metricsBucket",
196-
ServiceAccount: "tests.serviceAccount",
193+
PollingTimeout: "tests.pollingTimeout",
194+
GinkgoSkip: "tests.ginkgoSkip",
195+
GinkgoFocus: "tests.focus",
196+
TestsToRun: "tests.testsToRun",
197+
SuppressSkipNotifications: "tests.suppressSkipNotifications",
198+
CleanRuns: "tests.cleanRuns",
199+
OperatorSkip: "tests.operatorSkip",
200+
SkipClusterHealthChecks: "tests.skipClusterHealthChecks",
201+
MetricsBucket: "tests.metricsBucket",
202+
ServiceAccount: "tests.serviceAccount",
203+
ClusterHealthChecksTimeout: "tests.clusterHealthChecksTimeout",
197204
}
198205

199206
// Cluster config keys.
@@ -537,6 +544,9 @@ func init() {
537544
viper.SetDefault(Tests.SkipClusterHealthChecks, false)
538545
viper.BindEnv(Tests.OperatorSkip, "SKIP_CLUSTER_HEALTH_CHECKS")
539546

547+
viper.SetDefault(Tests.ClusterHealthChecksTimeout, "2h")
548+
viper.BindEnv(Tests.ClusterHealthChecksTimeout, "CLUSTER_HEALTH_CHECKS_TIMEOUT")
549+
540550
viper.SetDefault(Tests.MetricsBucket, "osde2e-metrics")
541551
viper.BindEnv(Tests.MetricsBucket, "METRICS_BUCKET")
542552

pkg/e2e/e2e.go

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package e2e
44
import (
55
"bytes"
66
"compress/gzip"
7+
"context"
78
"encoding/json"
89
"encoding/xml"
910
"fmt"
@@ -19,6 +20,7 @@ import (
1920
"github.com/hpcloud/tail"
2021
junit "github.com/joshdk/go-junit"
2122
vegeta "github.com/tsenart/vegeta/lib"
23+
"k8s.io/client-go/kubernetes"
2224

2325
pd "github.com/PagerDuty/go-pagerduty"
2426
"github.com/onsi/ginkgo"
@@ -31,6 +33,7 @@ import (
3133
"github.com/openshift/osde2e/pkg/common/aws"
3234
"github.com/openshift/osde2e/pkg/common/cluster"
3335
clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
36+
"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
3437
"github.com/openshift/osde2e/pkg/common/clusterproperties"
3538
"github.com/openshift/osde2e/pkg/common/config"
3639
"github.com/openshift/osde2e/pkg/common/events"
@@ -121,14 +124,35 @@ func beforeSuite() bool {
121124
log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
122125
}
123126

124-
err = clusterutil.WaitForClusterReady(cluster.ID(), nil)
125-
events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
126-
if err != nil {
127-
log.Printf("Cluster failed health check: %v", err)
128-
getLogs()
129-
return false
127+
if viper.GetString(config.Tests.SkipClusterHealthChecks) != "true" {
128+
clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
129+
if err != nil {
130+
log.Printf("Failed looking up cluster config for healthcheck: %v", err)
131+
return false
132+
}
133+
kubeClient, err := kubernetes.NewForConfig(clusterConfig)
134+
if err != nil {
135+
log.Printf("Error generating Kube Clientset: %v\n", err)
136+
return false
137+
}
138+
duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
139+
if err != nil {
140+
log.Printf("Failed parsing health check timeout: %v", err)
141+
return false
142+
}
143+
ctx, cancel := context.WithTimeout(context.Background(), duration)
144+
defer cancel()
145+
err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
146+
events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
147+
if err != nil {
148+
log.Printf("Cluster failed health check: %v", err)
149+
getLogs()
150+
return false
151+
}
152+
log.Println("Cluster is healthy and ready for testing")
153+
} else {
154+
log.Println("Skipping health checks as requested")
130155
}
131-
132156
if len(viper.GetString(config.Addons.IDs)) > 0 {
133157
if viper.GetString(config.Provider) != "mock" {
134158
err = installAddons()

0 commit comments

Comments
 (0)