Skip to content

Commit 5321e5e

Browse files
authored
feat: cluster/component health backend & resources update (#976)
1 parent 50c6a42 commit 5321e5e

File tree

7 files changed

+222
-13
lines changed

7 files changed

+222
-13
lines changed

bundle/manifests/observability-operator.clusterserviceversion.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,13 @@ spec:
409409
- get
410410
- list
411411
- watch
412+
- apiGroups:
413+
- config.openshift.io
414+
resources:
415+
- clusteroperators
416+
verbs:
417+
- get
418+
- list
412419
- apiGroups:
413420
- config.openshift.io
414421
resources:
@@ -438,6 +445,13 @@ spec:
438445
- get
439446
- list
440447
- watch
448+
- apiGroups:
449+
- kubevirt.io
450+
resources:
451+
- kubevirts
452+
verbs:
453+
- get
454+
- list
441455
- apiGroups:
442456
- loki.grafana.com
443457
resources:
@@ -454,6 +468,13 @@ spec:
454468
verbs:
455469
- get
456470
- list
471+
- apiGroups:
472+
- machineconfiguration.openshift.io
473+
resources:
474+
- machineconfigpools
475+
verbs:
476+
- get
477+
- list
457478
- apiGroups:
458479
- monitoring.coreos.com
459480
resourceNames:

deploy/operator/observability-operator-cluster-role.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,13 @@ rules:
9292
- get
9393
- list
9494
- watch
95+
- apiGroups:
96+
- config.openshift.io
97+
resources:
98+
- clusteroperators
99+
verbs:
100+
- get
101+
- list
95102
- apiGroups:
96103
- config.openshift.io
97104
resources:
@@ -121,6 +128,13 @@ rules:
121128
- get
122129
- list
123130
- watch
131+
- apiGroups:
132+
- kubevirt.io
133+
resources:
134+
- kubevirts
135+
verbs:
136+
- get
137+
- list
124138
- apiGroups:
125139
- loki.grafana.com
126140
resources:
@@ -137,6 +151,13 @@ rules:
137151
verbs:
138152
- get
139153
- list
154+
- apiGroups:
155+
- machineconfiguration.openshift.io
156+
resources:
157+
- machineconfigpools
158+
verbs:
159+
- get
160+
- list
140161
- apiGroups:
141162
- monitoring.coreos.com
142163
resourceNames:

pkg/controllers/uiplugin/components.go

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,15 +121,30 @@ func pluginComponentReconcilers(plugin *uiv1alpha1.UIPlugin, pluginInfo UIPlugin
121121
monitoringConfig.Incidents != nil &&
122122
monitoringConfig.Incidents.Enabled &&
123123
pluginInfo.HealthAnalyzerImage != ""
124+
125+
healthAnalyzerEnabled := monitoringConfig != nil &&
126+
monitoringConfig.ClusterHealthAnalyzer != nil &&
127+
monitoringConfig.ClusterHealthAnalyzer.Enabled &&
128+
pluginInfo.HealthAnalyzerImage != ""
129+
130+
deployHealthAnalyzer := incidentsEnabled || healthAnalyzerEnabled
131+
132+
components = append(components,
133+
reconciler.NewOptionalUpdater(componentsHealthClusterRole("components-health-view"), plugin, deployHealthAnalyzer),
134+
reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, "components-health-view", plugin.Name+"-"+"components-health-view"), plugin, deployHealthAnalyzer),
135+
reconciler.NewOptionalUpdater(newComponentHealthConfig(namespace), plugin, deployHealthAnalyzer),
136+
)
137+
124138
components = append(components,
125-
reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, monitorClusterroleName, plugin.Name+"-"+monitorClusterroleName), plugin, incidentsEnabled),
126-
reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, "system:auth-delegator", serviceAccountName+"-system-auth-delegator"), plugin, incidentsEnabled),
127-
reconciler.NewOptionalUpdater(newAlertManagerViewRoleBinding(serviceAccountName, namespace), plugin, incidentsEnabled),
128-
reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRole(namespace), plugin, incidentsEnabled),
129-
reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRoleBinding(namespace), plugin, incidentsEnabled),
130-
reconciler.NewOptionalUpdater(newHealthAnalyzerService(namespace), plugin, incidentsEnabled),
131-
reconciler.NewOptionalUpdater(newHealthAnalyzerDeployment(namespace, serviceAccountName, pluginInfo), plugin, incidentsEnabled),
132-
reconciler.NewOptionalUpdater(newHealthAnalyzerServiceMonitor(namespace), plugin, incidentsEnabled),
139+
// Keep the "-" separator between the plugin name and the role name so the
// ClusterRoleBinding is named "<plugin>-cluster-monitoring-view", matching the
// naming used for the other plugin-scoped bindings in this function.
reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, "cluster-monitoring-view", plugin.Name+"-"+"cluster-monitoring-view"), plugin, deployHealthAnalyzer),
140+
reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, "system:auth-delegator", serviceAccountName+"-system-auth-delegator"), plugin, deployHealthAnalyzer),
141+
reconciler.NewOptionalUpdater(newAlertManagerViewRoleBinding(serviceAccountName, namespace), plugin, deployHealthAnalyzer),
142+
reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRole(namespace), plugin, deployHealthAnalyzer),
143+
reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRoleBinding(namespace), plugin, deployHealthAnalyzer),
144+
reconciler.NewOptionalUpdater(newHealthAnalyzerService(namespace), plugin, deployHealthAnalyzer),
145+
reconciler.NewOptionalUpdater(newHealthAnalyzerDeployment(namespace, serviceAccountName, pluginInfo.HealthAnalyzerImage),
146+
plugin, deployHealthAnalyzer),
147+
reconciler.NewOptionalUpdater(newHealthAnalyzerServiceMonitor(namespace), plugin, deployHealthAnalyzer),
133148
)
134149

135150
persesServiceAccountName := "perses" + serviceAccountSuffix
@@ -436,6 +451,43 @@ func newService(info UIPluginInfo, namespace string) *corev1.Service {
436451
}
437452
}
438453

454+
// componentsHealthClusterRole creates a new clusterrole with the provided name.
455+
// The clusterrole has read permissions to the cluster resources and it is required
456+
// for the component health evaluation.
457+
func componentsHealthClusterRole(name string) *rbacv1.ClusterRole {
458+
return &rbacv1.ClusterRole{
459+
TypeMeta: metav1.TypeMeta{
460+
APIVersion: rbacv1.SchemeGroupVersion.String(),
461+
Kind: "ClusterRole",
462+
},
463+
ObjectMeta: metav1.ObjectMeta{
464+
Name: name,
465+
},
466+
Rules: []rbacv1.PolicyRule{
467+
{
468+
APIGroups: []string{""},
469+
Resources: []string{"nodes"},
470+
Verbs: []string{"get", "list"},
471+
},
472+
{
473+
APIGroups: []string{"config.openshift.io"},
474+
Resources: []string{"clusteroperators"},
475+
Verbs: []string{"get", "list"},
476+
},
477+
{
478+
APIGroups: []string{"machineconfiguration.openshift.io"},
479+
Resources: []string{"machineconfigpools"},
480+
Verbs: []string{"get", "list"},
481+
},
482+
{
483+
APIGroups: []string{"kubevirt.io"},
484+
Resources: []string{"kubevirts"},
485+
Verbs: []string{"get", "list"},
486+
},
487+
},
488+
}
489+
}
490+
439491
func newKorrel8rDeployment(name string, namespace string, info UIPluginInfo) *appsv1.Deployment {
440492
volumes := []corev1.Volume{
441493
{
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Default definition of the component tree used to evaluate component health
2+
# by the cluster-health-analyzer.
3+
components:
4+
- name: control-plane
5+
children:
6+
- name: nodes
7+
objects:
8+
- resource: nodes
9+
selectors:
10+
- matchLabels:
11+
node-role.kubernetes.io/control-plane: []
12+
- resource: machineconfigpools
13+
group: machineconfiguration.openshift.io
14+
selectors:
15+
- matchLabels:
16+
pools.operator.machineconfiguration.openshift.io/master: []
17+
- name: capacity
18+
children:
19+
- name: cpu
20+
alerts:
21+
selectors:
22+
- matchLabels:
23+
alertname: ["KubeCPUOvercommit","HighOverallControlPlaneCPU", "ExtremelyHighIndividualControlPlaneCPU"]
24+
- name: memory
25+
alerts:
26+
selectors:
27+
- matchLabels:
28+
alertname: ["HighOverallControlPlaneMemory", "ExtremelyHighIndividualControlPlaneMemory", "SystemMemoryExceedsReservation"]
29+
- name: operators
30+
children:
31+
- name: etcd
32+
alerts:
33+
selectors:
34+
- matchLabels:
35+
namespace: ["openshift-etcd","openshift-etcd-operator"]
36+
- name: addons
37+
children:
38+
- name: kubevirt
39+
alerts:
40+
selectors:
41+
- matchLabels:
42+
kubernetes_operator_part_of: ["kubevirt"]
43+
- matchLabels:
44+
namespace: ["openshift-cnv"]
45+
objects:
46+
- group: kubevirt.io
47+
resource: kubevirts

pkg/controllers/uiplugin/controller.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ const (
106106
//+kubebuilder:rbac:groups=authentication.k8s.io,resources=tokenreviews,verbs=create
107107
//+kubebuilder:rbac:groups=authorization.k8s.io,resources=subjectaccessreviews,verbs=create
108108
//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;create;update;patch;delete
109+
//+kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list
110+
//+kubebuilder:rbac:groups=machineconfiguration.openshift.io,resources=machineconfigpools,verbs=get;list
111+
//+kubebuilder:rbac:groups=kubevirt.io,resources=kubevirts,verbs=get;list
109112

110113
const finalizerName = "uiplugin.observability.openshift.io/finalizer"
111114

pkg/controllers/uiplugin/health_analyzer.go

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,28 @@
11
package uiplugin
22

33
import (
4+
_ "embed"
5+
46
monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
57
appsv1 "k8s.io/api/apps/v1"
68
corev1 "k8s.io/api/core/v1"
9+
v1 "k8s.io/api/core/v1"
710
rbacv1 "k8s.io/api/rbac/v1"
811
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
912
"k8s.io/apimachinery/pkg/util/intstr"
1013
"k8s.io/utils/ptr"
1114
)
1215

1316
const (
14-
name = "health-analyzer"
15-
volumeMountName = name + "-tls"
17+
name = "health-analyzer"
18+
volumeMountName = name + "-tls"
19+
componentConfigVolumeName = "components-health-config"
20+
componentConfigMapName = "components-config"
1621
)
1722

23+
//go:embed config/health-analyzer.yaml
24+
var componentHealthConfig string
25+
1826
func newHealthAnalyzerPrometheusRole(namespace string) *rbacv1.Role {
1927
role := &rbacv1.Role{
2028
TypeMeta: metav1.TypeMeta{
@@ -94,7 +102,10 @@ func newHealthAnalyzerService(namespace string) *corev1.Service {
94102
return service
95103
}
96104

97-
func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pluginInfo UIPluginInfo) *appsv1.Deployment {
105+
func newHealthAnalyzerDeployment(namespace string,
106+
serviceAccountName string,
107+
image string) *appsv1.Deployment {
108+
98109
deploy := &appsv1.Deployment{
99110
TypeMeta: metav1.TypeMeta{
100111
APIVersion: appsv1.SchemeGroupVersion.String(),
@@ -122,7 +133,7 @@ func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pl
122133
Containers: []corev1.Container{
123134
{
124135
Name: name,
125-
Image: pluginInfo.HealthAnalyzerImage,
136+
Image: image,
126137
ImagePullPolicy: corev1.PullAlways,
127138
Args: []string{
128139
"serve",
@@ -162,6 +173,11 @@ func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pl
162173
Name: volumeMountName,
163174
ReadOnly: true,
164175
},
176+
{
177+
Name: componentConfigVolumeName,
178+
MountPath: "/etc/config",
179+
ReadOnly: true,
180+
},
165181
},
166182
},
167183
},
@@ -174,6 +190,16 @@ func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pl
174190
},
175191
},
176192
},
193+
{
194+
Name: componentConfigVolumeName,
195+
VolumeSource: corev1.VolumeSource{
196+
ConfigMap: &corev1.ConfigMapVolumeSource{
197+
LocalObjectReference: corev1.LocalObjectReference{
198+
Name: componentConfigMapName,
199+
},
200+
},
201+
},
202+
},
177203
},
178204
},
179205
},
@@ -218,3 +244,24 @@ func newHealthAnalyzerServiceMonitor(namespace string) *monv1.ServiceMonitor {
218244

219245
return serviceMonitor
220246
}
247+
248+
// newComponentHealthConfig creates a new ConfigMap
249+
// that defines the components whose health is evaluated.
250+
func newComponentHealthConfig(namespace string) *v1.ConfigMap {
251+
cm := v1.ConfigMap{
252+
TypeMeta: metav1.TypeMeta{
253+
APIVersion: v1.SchemeGroupVersion.String(),
254+
Kind: "ConfigMap",
255+
},
256+
ObjectMeta: metav1.ObjectMeta{
257+
Namespace: namespace,
258+
Name: componentConfigMapName,
259+
Labels: componentLabels("monitoring"),
260+
},
261+
Data: map[string]string{
262+
"components.yaml": componentHealthConfig,
263+
},
264+
}
265+
266+
return &cm
267+
}

pkg/controllers/uiplugin/monitoring.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,19 @@ func validatePersesConfig(config *uiv1alpha1.MonitoringConfig) bool {
4444
return config.Perses != nil && config.Perses.Enabled
4545
}
4646

47+
func validateHealthanalyzerConfig(config *uiv1alpha1.MonitoringConfig, clusterVersion string) bool {
48+
enabled := config.ClusterHealthAnalyzer != nil &&
49+
config.ClusterHealthAnalyzer.Enabled
50+
51+
if !strings.HasPrefix(clusterVersion, "v") {
52+
clusterVersion = "v" + clusterVersion
53+
}
54+
canonicalClusterVersion := fmt.Sprintf("%s-0", semver.Canonical(clusterVersion))
55+
minClusterVersionMet := semver.Compare(canonicalClusterVersion, "v4.19.0-0") >= 0
56+
57+
return enabled && minClusterVersionMet
58+
}
59+
4760
func validateIncidentsConfig(config *uiv1alpha1.MonitoringConfig, clusterVersion string) bool {
4861
enabled := config.Incidents != nil && config.Incidents.Enabled
4962

@@ -191,8 +204,9 @@ func createMonitoringPluginInfo(plugin *uiv1alpha1.UIPlugin, namespace, name, im
191204
isValidAcmConfig := validateACMConfig(config)
192205
isValidPersesConfig := validatePersesConfig(config)
193206
isValidIncidentsConfig := validateIncidentsConfig(config, clusterVersion)
207+
isValidHealthAnalyzerConfig := validateHealthanalyzerConfig(config, clusterVersion)
194208

195-
atLeastOneValidConfig := isValidAcmConfig || isValidPersesConfig || isValidIncidentsConfig
209+
atLeastOneValidConfig := isValidAcmConfig || isValidPersesConfig || isValidIncidentsConfig || isValidHealthAnalyzerConfig
196210

197211
pluginInfo := getBasePluginInfo(namespace, name, image)
198212
if !atLeastOneValidConfig {
@@ -215,6 +229,10 @@ func createMonitoringPluginInfo(plugin *uiv1alpha1.UIPlugin, namespace, name, im
215229
pluginInfo.HealthAnalyzerImage = healthAnalyzerImage
216230
features = append(features, "incidents")
217231
}
232+
if isValidHealthAnalyzerConfig {
233+
pluginInfo.HealthAnalyzerImage = healthAnalyzerImage
234+
features = append(features, "cluster-health-analyzer")
235+
}
218236
addFeatureFlags(pluginInfo, features)
219237

220238
return pluginInfo, nil

0 commit comments

Comments
 (0)