Skip to content

Commit 21e6368

Browse files
authored
NETOBSERV-2596: Make console plugin controller use health metadata for config, set some default rules as recording (#2388)
* Make console plugin controller use health metadata for config, set some default rules as recording
  - Refactor all alerts to implement a HealthRule interface
  - HealthRule provides the Annotations, RecordingName and the PrometheusRule
  - RecordingName is now provided explicitly
  - Split logic between "builder" and "context"
  - Console plugin controller just dumps annotations to config
  - Change some defaults to Recording
* Add kind label to metadata; rename owner->workload for user-exposed API
  Since these annotations are a user-exposed API, it is preferable to use more k8s-standard terminology, i.e. "workload" instead of "owner".
* Fix TODOs
* Move more rules to recording
* Metadata doc
1 parent 4b09e40 commit 21e6368

27 files changed

+1126
-1046
lines changed

api/flowcollector/v1beta2/flowcollector_defaults.go

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"time"
55

66
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
7+
"k8s.io/utils/ptr"
78
)
89

910
var (
@@ -48,6 +49,7 @@ var (
4849
DefaultHealthRules = []FLPHealthRule{
4950
{
5051
Template: HealthRulePacketDropsByKernel,
52+
Mode: ModeRecording,
5153
Variants: []HealthRuleVariant{
5254
{
5355
Thresholds: HealthRuleThresholds{
@@ -58,9 +60,10 @@ var (
5860
GroupBy: GroupByNamespace,
5961
},
6062
{
63+
Mode: ptr.To(ModeRecording),
6164
Thresholds: HealthRuleThresholds{
6265
Info: "5",
63-
Warning: "10",
66+
Warning: "15",
6467
},
6568
GroupBy: GroupByNode,
6669
},
@@ -71,7 +74,8 @@ var (
7174
Variants: []HealthRuleVariant{
7275
{
7376
Thresholds: HealthRuleThresholds{
74-
Warning: "5",
77+
Info: "5",
78+
Warning: "10",
7579
},
7680
GroupBy: GroupByNode,
7781
},
@@ -82,12 +86,12 @@ var (
8286
Variants: []HealthRuleVariant{
8387
{
8488
Thresholds: HealthRuleThresholds{
85-
Critical: "2",
89+
Warning: "2",
8690
},
8791
},
8892
{
8993
Thresholds: HealthRuleThresholds{
90-
Critical: "2",
94+
Warning: "2",
9195
},
9296
GroupBy: GroupByNode,
9397
},
@@ -112,6 +116,7 @@ var (
112116
},
113117
{
114118
Template: HealthRuleDNSNxDomain,
119+
Mode: ModeRecording,
115120
Variants: []HealthRuleVariant{
116121
{
117122
Thresholds: HealthRuleThresholds{
@@ -124,6 +129,7 @@ var (
124129
},
125130
{
126131
Template: HealthRuleNetpolDenied,
132+
Mode: ModeRecording,
127133
Variants: []HealthRuleVariant{
128134
{
129135
Thresholds: HealthRuleThresholds{
@@ -136,6 +142,7 @@ var (
136142
},
137143
{
138144
Template: HealthRuleLatencyHighTrend,
145+
Mode: ModeRecording,
139146
Variants: []HealthRuleVariant{
140147
{
141148
Thresholds: HealthRuleThresholds{
@@ -149,6 +156,7 @@ var (
149156
},
150157
{
151158
Template: HealthRuleExternalEgressHighTrend,
159+
Mode: ModeRecording,
152160
Variants: []HealthRuleVariant{
153161
{
154162
Thresholds: HealthRuleThresholds{
@@ -171,6 +179,7 @@ var (
171179
},
172180
{
173181
Template: HealthRuleExternalIngressHighTrend,
182+
Mode: ModeRecording,
174183
Variants: []HealthRuleVariant{
175184
{
176185
Thresholds: HealthRuleThresholds{
@@ -193,6 +202,7 @@ var (
193202
},
194203
{
195204
Template: HealthRuleIngress5xxErrors,
205+
Mode: ModeRecording,
196206
Variants: []HealthRuleVariant{
197207
{
198208
Thresholds: HealthRuleThresholds{
@@ -205,6 +215,7 @@ var (
205215
},
206216
{
207217
Template: HealthRuleIngressHTTPLatencyTrend,
218+
Mode: ModeRecording,
208219
Variants: []HealthRuleVariant{
209220
{
210221
Thresholds: HealthRuleThresholds{

docs/HealthRules.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,8 @@ The label `netobserv: "true"` is required.
146146
The annotation `netobserv_io_network_health` is optional, and gives you some control over how the alert renders in the Health page. It is a JSON string that consists of:
147147
- `namespaceLabels`: one or more labels that hold namespaces. When provided, the alert will show up under the "Namespaces" tab.
148148
- `nodeLabels`: one or more labels that hold node names. When provided, the alert will show up under the "Nodes" tab.
149-
- `ownerLabels`: one or more labels that hold owner/workload names. When provided, the alert will show up under the "Owners" tab.
149+
- `workloadLabels`: one or more labels that hold owner/workload names. When provided alongside `kindLabels`, the alert will show up under the "Owners" tab.
150+
- `kindLabels`: one or more labels that hold owner/workload kinds. When provided alongside `workloadLabels`, the alert will show up under the "Owners" tab.
150151
- `threshold`: the alert threshold as a string, expected to match the one defined in PromQL.
151152
- `unit`: the data unit, used only for display purposes.
152153
- `upperBound`: an upper bound value used to compute score on a closed scale. It doesn't necessarily have to be a maximum of the metric values, but metric values will be clamped if they are above the upper bound.

internal/controller/consoleplugin/config/config.go

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -116,45 +116,18 @@ type FieldConfig struct {
116116
LokiLabel bool `yaml:"lokiLabel,omitempty" json:"lokiLabel,omitempty"`
117117
}
118118

119-
type HealthRuleLink struct {
120-
Name string `yaml:"name" json:"name"`
121-
URL string `yaml:"url" json:"url"`
122-
}
123-
124-
type HealthRuleMetadata struct {
125-
Template string `yaml:"template" json:"template"`
126-
Mode string `yaml:"mode" json:"mode"`
127-
Variants []HealthRuleVariantMetadata `yaml:"variants" json:"variants"`
128-
Description string `yaml:"description,omitempty" json:"description,omitempty"`
129-
Summary string `yaml:"summary,omitempty" json:"summary,omitempty"`
130-
Links []HealthRuleLink `yaml:"links,omitempty" json:"links,omitempty"`
131-
}
132-
133-
type HealthRuleVariantMetadata struct {
134-
GroupBy string `yaml:"groupBy,omitempty" json:"groupBy,omitempty"`
135-
LowVolumeThreshold string `yaml:"lowVolumeThreshold,omitempty" json:"lowVolumeThreshold,omitempty"`
136-
Thresholds ThresholdMetadata `yaml:"thresholds" json:"thresholds"`
137-
UpperBound string `yaml:"upperBound,omitempty" json:"upperBound,omitempty"`
138-
}
139-
140-
type ThresholdMetadata struct {
141-
Info string `yaml:"info,omitempty" json:"info,omitempty"`
142-
Warning string `yaml:"warning,omitempty" json:"warning,omitempty"`
143-
Critical string `yaml:"critical,omitempty" json:"critical,omitempty"`
144-
}
145-
146119
type FrontendConfig struct {
147-
RecordTypes []api.ConnTrackOutputRecordTypeEnum `yaml:"recordTypes" json:"recordTypes"`
148-
PortNaming flowslatest.ConsolePluginPortConfig `yaml:"portNaming,omitempty" json:"portNaming,omitempty"`
149-
Columns []ColumnConfig `yaml:"columns" json:"columns"`
150-
Filters []FilterConfig `yaml:"filters,omitempty" json:"filters,omitempty"`
151-
Scopes []ScopeConfig `yaml:"scopes" json:"scopes"`
152-
QuickFilters []flowslatest.QuickFilter `yaml:"quickFilters,omitempty" json:"quickFilters,omitempty"`
153-
AlertNamespaces []string `yaml:"alertNamespaces,omitempty" json:"alertNamespaces,omitempty"`
154-
Sampling int `yaml:"sampling" json:"sampling"`
155-
Features []string `yaml:"features" json:"features"`
156-
Fields []FieldConfig `yaml:"fields" json:"fields"`
157-
HealthRules []HealthRuleMetadata `yaml:"healthRules,omitempty" json:"healthRules,omitempty"`
120+
RecordTypes []api.ConnTrackOutputRecordTypeEnum `yaml:"recordTypes" json:"recordTypes"`
121+
PortNaming flowslatest.ConsolePluginPortConfig `yaml:"portNaming,omitempty" json:"portNaming,omitempty"`
122+
Columns []ColumnConfig `yaml:"columns" json:"columns"`
123+
Filters []FilterConfig `yaml:"filters,omitempty" json:"filters,omitempty"`
124+
Scopes []ScopeConfig `yaml:"scopes" json:"scopes"`
125+
QuickFilters []flowslatest.QuickFilter `yaml:"quickFilters,omitempty" json:"quickFilters,omitempty"`
126+
AlertNamespaces []string `yaml:"alertNamespaces,omitempty" json:"alertNamespaces,omitempty"`
127+
Sampling int `yaml:"sampling" json:"sampling"`
128+
Features []string `yaml:"features" json:"features"`
129+
Fields []FieldConfig `yaml:"fields" json:"fields"`
130+
RecordingAnnotations map[string]map[string]string `yaml:"recordingAnnotations,omitempty" json:"recordingAnnotations,omitempty"`
158131
}
159132

160133
type PluginConfig struct {

internal/controller/consoleplugin/consoleplugin_objects.go

Lines changed: 10 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"fmt"
66
"hash/fnv"
7-
"math"
87
"path/filepath"
98
"slices"
109
"strconv"
@@ -503,76 +502,23 @@ func (b *builder) setFrontendConfig(fconf *cfg.FrontendConfig) error {
503502
}
504503

505504
// Add health rules metadata for frontend
506-
fconf.HealthRules = b.getHealthRulesMetadata()
505+
fconf.RecordingAnnotations = b.getHealthRecordingAnnotations()
507506

508507
return nil
509508
}
510509

511-
func (b *builder) getHealthRulesMetadata() []cfg.HealthRuleMetadata {
512-
var metadata []cfg.HealthRuleMetadata
513-
514-
healthRules := b.desired.GetFLPHealthRules()
515-
for _, healthRule := range healthRules {
516-
if ok, _ := healthRule.IsAllowed(b.desired); !ok {
517-
continue
518-
}
519-
520-
var variants []cfg.HealthRuleVariantMetadata
521-
for _, variant := range healthRule.Variants {
522-
// Calculate upperBound for trending alerts: max(threshold × 5, 100)
523-
// For trending rules (LatencyHighTrend, ExternalEgressHighTrend, ExternalIngressHighTrend),
524-
// we need an upper bound for score calculation
525-
upperBound := ""
526-
isTrending := healthRule.Template == flowslatest.HealthRuleLatencyHighTrend ||
527-
healthRule.Template == flowslatest.HealthRuleExternalEgressHighTrend ||
528-
healthRule.Template == flowslatest.HealthRuleExternalIngressHighTrend
529-
if isTrending {
530-
// Use the highest defined threshold (critical > warning > info)
531-
thresholdStr := variant.Thresholds.Critical
532-
if thresholdStr == "" {
533-
thresholdStr = variant.Thresholds.Warning
534-
}
535-
if thresholdStr == "" {
536-
thresholdStr = variant.Thresholds.Info
537-
}
538-
if thresholdStr != "" {
539-
if val, err := strconv.ParseFloat(thresholdStr, 64); err == nil {
540-
upperBound = strconv.Itoa(int(math.Max(val*5, 100)))
541-
}
542-
}
510+
func (b *builder) getHealthRecordingAnnotations() map[string]map[string]string {
511+
annotsPerRecording := make(map[string]map[string]string)
512+
healthRules, _ := alerts.BuildHealthRules(b.desired)
513+
for _, r := range healthRules {
514+
rname := r.RecordingName()
515+
if rname != "" {
516+
if a, _ := r.GetAnnotations(); len(a) > 0 {
517+
annotsPerRecording[rname] = a
543518
}
544-
545-
variants = append(variants, cfg.HealthRuleVariantMetadata{
546-
GroupBy: string(variant.GroupBy),
547-
LowVolumeThreshold: variant.LowVolumeThreshold,
548-
Thresholds: cfg.ThresholdMetadata{
549-
Info: variant.Thresholds.Info,
550-
Warning: variant.Thresholds.Warning,
551-
Critical: variant.Thresholds.Critical,
552-
},
553-
UpperBound: upperBound,
554-
})
555519
}
556-
557-
// Get description and summary from centralized template metadata
558-
templateInfo, ok := alerts.TemplateMetadata[healthRule.Template]
559-
description := ""
560-
summary := ""
561-
if ok {
562-
description = templateInfo.DescriptionPattern
563-
summary = templateInfo.Summary
564-
}
565-
566-
metadata = append(metadata, cfg.HealthRuleMetadata{
567-
Template: string(healthRule.Template),
568-
Mode: string(healthRule.Mode),
569-
Variants: variants,
570-
Description: description,
571-
Summary: summary,
572-
})
573520
}
574-
575-
return metadata
521+
return annotsPerRecording
576522
}
577523

578524
func getLokiStatus(lokiStack *lokiv1.LokiStack) string {

internal/controller/flp/flp_controller_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ func ControllerSpecs() {
465465
}, &pr)
466466
}, timeout, interval).Should(Succeed())
467467
Expect(pr.Spec.Groups).Should(HaveLen(1))
468-
Expect(pr.Spec.Groups[0].Rules).Should(HaveLen(12))
468+
Expect(pr.Spec.Groups[0].Rules).Should(HaveLen(9))
469469

470470
// Manually delete ServiceMonitor
471471
By("Deleting ServiceMonitor")

internal/controller/flp/flp_monolith_reconciler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ func (r *monolithReconciler) reconcilePrometheusService(ctx context.Context, bui
194194
}
195195
}
196196
if r.ClusterInfo.HasPromRule() {
197-
rules := alerts.BuildRules(ctx, builder.desired)
197+
rules := alerts.BuildMonitoringRules(ctx, builder.desired)
198198
promRules := builder.prometheusRule(rules)
199199
if err := reconcilers.GenericReconcile(ctx, r.Managed, &r.Client, r.prometheusRule, promRules, &report, helper.PrometheusRuleChanged); err != nil {
200200
return err

internal/controller/flp/flp_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ func TestPrometheusRuleNoChange(t *testing.T) {
570570
ns := "namespace"
571571
cfg := getConfig()
572572
b := monoBuilder(ns, &cfg)
573-
r := alerts.BuildRules(context.Background(), &cfg)
573+
r := alerts.BuildMonitoringRules(context.Background(), &cfg)
574574
first := b.prometheusRule(r)
575575

576576
// Check no change
@@ -587,13 +587,13 @@ func TestPrometheusRuleChanged(t *testing.T) {
587587
// Get first
588588
cfg := getConfig()
589589
b := monoBuilder("namespace", &cfg)
590-
r := alerts.BuildRules(context.Background(), &cfg)
590+
r := alerts.BuildMonitoringRules(context.Background(), &cfg)
591591
first := b.prometheusRule(r)
592592

593593
// Check enabled rule change
594594
cfg.Processor.Metrics.DisableAlerts = []flowslatest.HealthRuleTemplate{flowslatest.AlertNoFlows}
595595
b = monoBuilder("namespace", &cfg)
596-
r = alerts.BuildRules(context.Background(), &cfg)
596+
r = alerts.BuildMonitoringRules(context.Background(), &cfg)
597597
second := b.prometheusRule(r)
598598

599599
report := helper.NewChangeReport("")
@@ -603,7 +603,7 @@ func TestPrometheusRuleChanged(t *testing.T) {
603603
// Check labels change
604604
info := reconcilers.Common{Namespace: "namespace2", ClusterInfo: &cluster.Info{}}
605605
b, _ = newMonolithBuilder(info.NewInstance(image2, status.Instance{}), &cfg, b.flowMetrics, nil, nil)
606-
r = alerts.BuildRules(context.Background(), &cfg)
606+
r = alerts.BuildMonitoringRules(context.Background(), &cfg)
607607
third := b.prometheusRule(r)
608608

609609
report = helper.NewChangeReport("")

internal/controller/flp/flp_transfo_reconciler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ func (r *transformerReconciler) reconcilePrometheusService(ctx context.Context,
201201
}
202202
}
203203
if r.ClusterInfo.HasPromRule() {
204-
rules := alerts.BuildRules(ctx, builder.desired)
204+
rules := alerts.BuildMonitoringRules(ctx, builder.desired)
205205
promRules := builder.prometheusRule(rules)
206206
if err := reconcilers.GenericReconcile(ctx, r.Managed, &r.Client, r.prometheusRule, promRules, &report, helper.PrometheusRuleChanged); err != nil {
207207
return err

0 commit comments

Comments
 (0)