Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions api/flowcollector/v1beta2/flowcollector_defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"time"

v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
)

var (
Expand Down Expand Up @@ -48,6 +49,7 @@ var (
DefaultHealthRules = []FLPHealthRule{
{
Template: HealthRulePacketDropsByKernel,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand All @@ -58,9 +60,10 @@ var (
GroupBy: GroupByNamespace,
},
{
Mode: ptr.To(ModeRecording),
Thresholds: HealthRuleThresholds{
Info: "5",
Warning: "10",
Warning: "15",
},
GroupBy: GroupByNode,
},
Expand All @@ -71,7 +74,8 @@ var (
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Warning: "5",
Info: "5",
Warning: "10",
},
GroupBy: GroupByNode,
},
Expand All @@ -82,12 +86,12 @@ var (
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Critical: "2",
Warning: "2",
},
},
{
Thresholds: HealthRuleThresholds{
Critical: "2",
Warning: "2",
},
GroupBy: GroupByNode,
},
Expand All @@ -112,6 +116,7 @@ var (
},
{
Template: HealthRuleDNSNxDomain,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand All @@ -124,6 +129,7 @@ var (
},
{
Template: HealthRuleNetpolDenied,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand All @@ -136,6 +142,7 @@ var (
},
{
Template: HealthRuleLatencyHighTrend,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand All @@ -149,6 +156,7 @@ var (
},
{
Template: HealthRuleExternalEgressHighTrend,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand All @@ -171,6 +179,7 @@ var (
},
{
Template: HealthRuleExternalIngressHighTrend,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand All @@ -193,6 +202,7 @@ var (
},
{
Template: HealthRuleIngress5xxErrors,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand All @@ -205,6 +215,7 @@ var (
},
{
Template: HealthRuleIngressHTTPLatencyTrend,
Mode: ModeRecording,
Variants: []HealthRuleVariant{
{
Thresholds: HealthRuleThresholds{
Expand Down
3 changes: 2 additions & 1 deletion docs/HealthRules.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ The label `netobserv: "true"` is required.
The annotation `netobserv_io_network_health` is optional and gives you some control over how the alert renders in the Health page. It is a JSON string that consists of:
- `namespaceLabels`: one or more labels that hold namespaces. When provided, the alert will show up under the "Namespaces" tab.
- `nodeLabels`: one or more labels that hold node names. When provided, the alert will show up under the "Nodes" tab.
- `ownerLabels`: one or more labels that hold owner/workload names. When provided, the alert will show up under the "Owners" tab.
- `workloadLabels`: one or more labels that hold owner/workload names. When provided together with `kindLabels`, the alert will show up under the "Owners" tab.
- `kindLabels`: one or more labels that hold owner/workload kinds. When provided together with `workloadLabels`, the alert will show up under the "Owners" tab.
- `threshold`: the alert threshold as a string, expected to match the one defined in PromQL.
- `unit`: the data unit, used only for display purposes.
- `upperBound`: an upper bound value used to compute score on a closed scale. It doesn't necessarily have to be a maximum of the metric values, but metric values will be clamped if they are above the upper bound.
Expand Down
49 changes: 11 additions & 38 deletions internal/controller/consoleplugin/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,45 +115,18 @@ type FieldConfig struct {
LokiLabel bool `yaml:"lokiLabel,omitempty" json:"lokiLabel,omitempty"`
}

type HealthRuleLink struct {
Name string `yaml:"name" json:"name"`
URL string `yaml:"url" json:"url"`
}

type HealthRuleMetadata struct {
Template string `yaml:"template" json:"template"`
Mode string `yaml:"mode" json:"mode"`
Variants []HealthRuleVariantMetadata `yaml:"variants" json:"variants"`
Description string `yaml:"description,omitempty" json:"description,omitempty"`
Summary string `yaml:"summary,omitempty" json:"summary,omitempty"`
Links []HealthRuleLink `yaml:"links,omitempty" json:"links,omitempty"`
}

type HealthRuleVariantMetadata struct {
GroupBy string `yaml:"groupBy,omitempty" json:"groupBy,omitempty"`
LowVolumeThreshold string `yaml:"lowVolumeThreshold,omitempty" json:"lowVolumeThreshold,omitempty"`
Thresholds ThresholdMetadata `yaml:"thresholds" json:"thresholds"`
UpperBound string `yaml:"upperBound,omitempty" json:"upperBound,omitempty"`
}

type ThresholdMetadata struct {
Info string `yaml:"info,omitempty" json:"info,omitempty"`
Warning string `yaml:"warning,omitempty" json:"warning,omitempty"`
Critical string `yaml:"critical,omitempty" json:"critical,omitempty"`
}

type FrontendConfig struct {
RecordTypes []api.ConnTrackOutputRecordTypeEnum `yaml:"recordTypes" json:"recordTypes"`
PortNaming flowslatest.ConsolePluginPortConfig `yaml:"portNaming,omitempty" json:"portNaming,omitempty"`
Columns []ColumnConfig `yaml:"columns" json:"columns"`
Filters []FilterConfig `yaml:"filters,omitempty" json:"filters,omitempty"`
Scopes []ScopeConfig `yaml:"scopes" json:"scopes"`
QuickFilters []flowslatest.QuickFilter `yaml:"quickFilters,omitempty" json:"quickFilters,omitempty"`
AlertNamespaces []string `yaml:"alertNamespaces,omitempty" json:"alertNamespaces,omitempty"`
Sampling int `yaml:"sampling" json:"sampling"`
Features []string `yaml:"features" json:"features"`
Fields []FieldConfig `yaml:"fields" json:"fields"`
HealthRules []HealthRuleMetadata `yaml:"healthRules,omitempty" json:"healthRules,omitempty"`
RecordTypes []api.ConnTrackOutputRecordTypeEnum `yaml:"recordTypes" json:"recordTypes"`
PortNaming flowslatest.ConsolePluginPortConfig `yaml:"portNaming,omitempty" json:"portNaming,omitempty"`
Columns []ColumnConfig `yaml:"columns" json:"columns"`
Filters []FilterConfig `yaml:"filters,omitempty" json:"filters,omitempty"`
Scopes []ScopeConfig `yaml:"scopes" json:"scopes"`
QuickFilters []flowslatest.QuickFilter `yaml:"quickFilters,omitempty" json:"quickFilters,omitempty"`
AlertNamespaces []string `yaml:"alertNamespaces,omitempty" json:"alertNamespaces,omitempty"`
Sampling int `yaml:"sampling" json:"sampling"`
Features []string `yaml:"features" json:"features"`
Fields []FieldConfig `yaml:"fields" json:"fields"`
RecordingAnnotations map[string]map[string]string `yaml:"recordingAnnotations,omitempty" json:"recordingAnnotations,omitempty"`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question, although I think I know the answer: the change here is to just send annotations instead of health rules, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it's to 100% fill the gap between alerts and recording, since the console plugin was able to get annotations on alerts but not on recordings

}

type PluginConfig struct {
Expand Down
74 changes: 10 additions & 64 deletions internal/controller/consoleplugin/consoleplugin_objects.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"context"
"fmt"
"hash/fnv"
"math"
"path/filepath"
"slices"
"strconv"
Expand Down Expand Up @@ -502,76 +501,23 @@ func (b *builder) setFrontendConfig(fconf *cfg.FrontendConfig) error {
}

// Add health rules metadata for frontend
fconf.HealthRules = b.getHealthRulesMetadata()
fconf.RecordingAnnotations = b.getHealthRecordingAnnotations()

return nil
}

func (b *builder) getHealthRulesMetadata() []cfg.HealthRuleMetadata {
var metadata []cfg.HealthRuleMetadata

healthRules := b.desired.GetFLPHealthRules()
for _, healthRule := range healthRules {
if ok, _ := healthRule.IsAllowed(b.desired); !ok {
continue
}

var variants []cfg.HealthRuleVariantMetadata
for _, variant := range healthRule.Variants {
// Calculate upperBound for trending alerts: max(threshold × 5, 100)
// For trending rules (LatencyHighTrend, ExternalEgressHighTrend, ExternalIngressHighTrend),
// we need an upper bound for score calculation
upperBound := ""
isTrending := healthRule.Template == flowslatest.HealthRuleLatencyHighTrend ||
healthRule.Template == flowslatest.HealthRuleExternalEgressHighTrend ||
healthRule.Template == flowslatest.HealthRuleExternalIngressHighTrend
if isTrending {
// Use the highest defined threshold (critical > warning > info)
thresholdStr := variant.Thresholds.Critical
if thresholdStr == "" {
thresholdStr = variant.Thresholds.Warning
}
if thresholdStr == "" {
thresholdStr = variant.Thresholds.Info
}
if thresholdStr != "" {
if val, err := strconv.ParseFloat(thresholdStr, 64); err == nil {
upperBound = strconv.Itoa(int(math.Max(val*5, 100)))
}
}
func (b *builder) getHealthRecordingAnnotations() map[string]map[string]string {
annotsPerRecording := make(map[string]map[string]string)
healthRules, _ := alerts.BuildHealthRules(b.desired)
for _, r := range healthRules {
rname := r.RecordingName()
if rname != "" {
if a, _ := r.GetAnnotations(); len(a) > 0 {
annotsPerRecording[rname] = a
}

variants = append(variants, cfg.HealthRuleVariantMetadata{
GroupBy: string(variant.GroupBy),
LowVolumeThreshold: variant.LowVolumeThreshold,
Thresholds: cfg.ThresholdMetadata{
Info: variant.Thresholds.Info,
Warning: variant.Thresholds.Warning,
Critical: variant.Thresholds.Critical,
},
UpperBound: upperBound,
})
}

// Get description and summary from centralized template metadata
templateInfo, ok := alerts.TemplateMetadata[healthRule.Template]
description := ""
summary := ""
if ok {
description = templateInfo.DescriptionPattern
summary = templateInfo.Summary
}

metadata = append(metadata, cfg.HealthRuleMetadata{
Template: string(healthRule.Template),
Mode: string(healthRule.Mode),
Variants: variants,
Description: description,
Summary: summary,
})
}

return metadata
return annotsPerRecording
}

// returns a configmap with a digest of its configuration contents, which will be used to
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/flp/flp_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ func ControllerSpecs() {
}, &pr)
}, timeout, interval).Should(Succeed())
Expect(pr.Spec.Groups).Should(HaveLen(1))
Expect(pr.Spec.Groups[0].Rules).Should(HaveLen(12))
Expect(pr.Spec.Groups[0].Rules).Should(HaveLen(9))

// Manually delete ServiceMonitor
By("Deleting ServiceMonitor")
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/flp/flp_monolith_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func (r *monolithReconciler) reconcilePrometheusService(ctx context.Context, bui
}
}
if r.ClusterInfo.HasPromRule() {
rules := alerts.BuildRules(ctx, builder.desired)
rules := alerts.BuildMonitoringRules(ctx, builder.desired)
promRules := builder.prometheusRule(rules)
if err := reconcilers.GenericReconcile(ctx, r.Managed, &r.Client, r.prometheusRule, promRules, &report, helper.PrometheusRuleChanged); err != nil {
return err
Expand Down
8 changes: 4 additions & 4 deletions internal/controller/flp/flp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ func TestPrometheusRuleNoChange(t *testing.T) {
ns := "namespace"
cfg := getConfig()
b := monoBuilder(ns, &cfg)
r := alerts.BuildRules(context.Background(), &cfg)
r := alerts.BuildMonitoringRules(context.Background(), &cfg)
first := b.prometheusRule(r)

// Check no change
Expand All @@ -587,13 +587,13 @@ func TestPrometheusRuleChanged(t *testing.T) {
// Get first
cfg := getConfig()
b := monoBuilder("namespace", &cfg)
r := alerts.BuildRules(context.Background(), &cfg)
r := alerts.BuildMonitoringRules(context.Background(), &cfg)
first := b.prometheusRule(r)

// Check enabled rule change
cfg.Processor.Metrics.DisableAlerts = []flowslatest.HealthRuleTemplate{flowslatest.AlertNoFlows}
b = monoBuilder("namespace", &cfg)
r = alerts.BuildRules(context.Background(), &cfg)
r = alerts.BuildMonitoringRules(context.Background(), &cfg)
second := b.prometheusRule(r)

report := helper.NewChangeReport("")
Expand All @@ -603,7 +603,7 @@ func TestPrometheusRuleChanged(t *testing.T) {
// Check labels change
info := reconcilers.Common{Namespace: "namespace2", ClusterInfo: &cluster.Info{}}
b, _ = newMonolithBuilder(info.NewInstance(image2, status.Instance{}), &cfg, b.flowMetrics, nil, nil)
r = alerts.BuildRules(context.Background(), &cfg)
r = alerts.BuildMonitoringRules(context.Background(), &cfg)
third := b.prometheusRule(r)

report = helper.NewChangeReport("")
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/flp/flp_transfo_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ func (r *transformerReconciler) reconcilePrometheusService(ctx context.Context,
}
}
if r.ClusterInfo.HasPromRule() {
rules := alerts.BuildRules(ctx, builder.desired)
rules := alerts.BuildMonitoringRules(ctx, builder.desired)
promRules := builder.prometheusRule(rules)
if err := reconcilers.GenericReconcile(ctx, r.Managed, &r.Client, r.prometheusRule, promRules, &report, helper.PrometheusRuleChanged); err != nil {
return err
Expand Down
Loading