NETOBSERV-2606: increase default cacheActiveTimeout to 15s (#2412)

jotak · web-flow · commit 3bd5626a1aba · 2026-02-09T09:56:59.000+01:00
Adapt cacheMaxFlows to 120K - betting it's still sufficient to cache all
the flows, but we can increase more if we find problems in our perf
tests

Add more info in README about fine tuning.
diff --git a/README.md b/README.md
@@ -208,16 +208,18 @@ More information on Prometheus metrics is available in a dedicated page: [Metric
 
 ### Performance fine-tuning
 
-In addition to sampling and using Kafka or not, other settings can help you get an optimal setup without compromising on the observability.
+In addition to sampling and using Kafka or not, other settings can help you get an optimal setup, with or without compromising on the observability.
 
 Here is what you should pay attention to:
 
-- Resource requirements and limits (`spec.agent.ebpf.resources`, `spec.agent.processor.resources`): adapt the resource requirements and limits to the load and memory usage you expect on your cluster. The default limits (800MB) should be sufficient for most medium sized clusters. You can read more about reqs and limits [here](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/).
-
-- eBPF agent's cache max flows (`spec.agent.ebpf.cacheMaxFlows`) and timeout (`spec.agent.ebpf.cacheActiveTimeout`) control how often flows are reported by the agents. The higher are `cacheMaxFlows` and `cacheActiveTimeout`, the less traffic will be generated by the agents themselves, which also ties with less CPU load. But on the flip side, it leads to a slightly higher memory consumption, and might generate more latency in the flow collection. There is [a blog entry](https://github.com/netobserv/documents/blob/main/blogs/agent_metrics_perf/index.md) dedicated to this fine-tuning.
+- eBPF agent's cache eviction interval (`spec.agent.ebpf.cacheActiveTimeout`) controls how often flows are reported by the agents. The higher it is, the more aggregated the flows are, which results in less traffic sent by the agents themselves, and also ties with less CPU load. But on the flip side, it leads to a slightly higher memory consumption in the agent, and generates more latency in the flow collection. It must be configured in relation to the max flows parameter (`spec.agent.ebpf.cacheMaxFlows`), which defines the size of the eBPF data structures, to make sure there is always enough room for new flows. There is [a blog entry](https://netobserv.io/posts/performance-fine-tuning-a-deep-dive-in-ebpf-agent-metrics/) dedicated to this tuning.
 
 - It is possible to reduce the overall observed traffic by restricting or excluding interfaces via `spec.agent.ebpf.interfaces` and `spec.agent.ebpf.excludeInterfaces`. Note that the interface names may vary according to the CNI used.
 
+- You can also add [eBPF filters](https://netobserv.io/posts/enhancing-netobserv-by-introducing-multi-rules-flow-filtering-capability-in-ebpf/) and [flowlogs-pipeline filters](https://github.com/netobserv/flowlogs-pipeline/blob/main/docs/filtering.md) to further narrow down what's being collected, if you find that you don't need every kind of flow. The former has the greatest impact on the performance of each component, while the latter mainly improves the storage/Loki end of the pipeline.
+
+- Resource requirements and limits (`spec.agent.ebpf.resources`, `spec.agent.processor.resources`): adapt the resource requirements and limits to the load and memory usage you expect on your cluster. The default limits (800MB) should be sufficient for most medium sized clusters. You can read more about reqs and limits [here](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/).
+
 - Each component offers more advanced settings via `spec.agent.ebpf.advanced`, `spec.processor.advanced`, `spec.loki.advanced` and `spec.consolePlugin.advanced`. The agent has [environment variables](https://github.com/netobserv/netobserv-ebpf-agent/blob/main/docs/config.md) that you can set through `spec.agent.ebpf.advanced.env`.
 
 #### Loki
diff --git a/api/flowcollector/v1beta2/flowcollector_types.go b/api/flowcollector/v1beta2/flowcollector_types.go
@@ -333,18 +333,18 @@ type FlowCollectorEBPF struct {
 	//+optional
 	Sampling *int32 `json:"sampling,omitempty"`
 
-	// `cacheActiveTimeout` is the max period during which the reporter aggregates flows before sending.
+	// `cacheActiveTimeout` is the period during which the agent aggregates flows before sending.
 	// Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
 	// however you can expect higher memory consumption and an increased latency in the flow collection.
 	//+kubebuilder:validation:Pattern:=^\d+(ns|ms|s|m)?$
-	//+kubebuilder:default:="5s"
+	//+kubebuilder:default:="15s"
 	CacheActiveTimeout string `json:"cacheActiveTimeout,omitempty"`
 
-	// `cacheMaxFlows` is the max number of flows in an aggregate; when reached, the reporter sends the flows.
+	// `cacheMaxFlows` is the maximum number of flows in an aggregate; when reached, the reporter sends the flows.
 	// Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
 	// however you can expect higher memory consumption and an increased latency in the flow collection.
 	//+kubebuilder:validation:Minimum=1
-	//+kubebuilder:default:=100000
+	//+kubebuilder:default:=120000
 	CacheMaxFlows int32 `json:"cacheMaxFlows,omitempty"`
 
 	// `interfaces` contains the interface names from where flows are collected. If empty, the agent
diff --git a/bundle/manifests/flows.netobserv.io_flowcollectors.yaml b/bundle/manifests/flows.netobserv.io_flowcollectors.yaml
@@ -1098,17 +1098,17 @@ spec:
                             type: object
                         type: object
                       cacheActiveTimeout:
-                        default: 5s
+                        default: 15s
                         description: |-
-                          `cacheActiveTimeout` is the max period during which the reporter aggregates flows before sending.
+                          `cacheActiveTimeout` is the period during which the agent aggregates flows before sending.
                           Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
                           however you can expect higher memory consumption and an increased latency in the flow collection.
                         pattern: ^\d+(ns|ms|s|m)?$
                         type: string
                       cacheMaxFlows:
-                        default: 100000
+                        default: 120000
                         description: |-
-                          `cacheMaxFlows` is the max number of flows in an aggregate; when reached, the reporter sends the flows.
+                          `cacheMaxFlows` is the maximum number of flows in an aggregate; when reached, the reporter sends the flows.
                           Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
                           however you can expect higher memory consumption and an increased latency in the flow collection.
                         format: int32
diff --git a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml
@@ -39,8 +39,8 @@ metadata:
           "spec": {
             "agent": {
               "ebpf": {
-                "cacheActiveTimeout": "5s",
-                "cacheMaxFlows": 100000,
+                "cacheActiveTimeout": "15s",
+                "cacheMaxFlows": 120000,
                 "excludeInterfaces": [
                   "lo"
                 ],
diff --git a/config/crd/bases/flows.netobserv.io_flowcollectors.yaml b/config/crd/bases/flows.netobserv.io_flowcollectors.yaml
@@ -1024,17 +1024,17 @@ spec:
                               type: object
                           type: object
                         cacheActiveTimeout:
-                          default: 5s
+                          default: 15s
                           description: |-
-                            `cacheActiveTimeout` is the max period during which the reporter aggregates flows before sending.
+                            `cacheActiveTimeout` is the period during which the agent aggregates flows before sending.
                             Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
                             however you can expect higher memory consumption and an increased latency in the flow collection.
                           pattern: ^\d+(ns|ms|s|m)?$
                           type: string
                         cacheMaxFlows:
-                          default: 100000
+                          default: 120000
                           description: |-
-                            `cacheMaxFlows` is the max number of flows in an aggregate; when reached, the reporter sends the flows.
+                            `cacheMaxFlows` is the maximum number of flows in an aggregate; when reached, the reporter sends the flows.
                             Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
                             however you can expect higher memory consumption and an increased latency in the flow collection.
                           format: int32
diff --git a/config/samples/flows_v1beta2_flowcollector.yaml b/config/samples/flows_v1beta2_flowcollector.yaml
@@ -11,11 +11,11 @@ spec:
   agent:
     type: eBPF
     ebpf:
-      # imagePullPolicy: IfNotPresent
-      # logLevel: info
+      # imagePullPolicy: Always
+      # logLevel: debug
       sampling: 50
-      cacheActiveTimeout: 5s
-      cacheMaxFlows: 100000
+      cacheActiveTimeout: 15s
+      cacheMaxFlows: 120000
       # Change privileged to "true" on old kernel version not knowing CAP_BPF or when using "PacketDrop" feature
       privileged: false
       # features:
@@ -77,8 +77,8 @@ spec:
   #       certFile: user.crt
   #       certKey: user.key
   processor:
-    # imagePullPolicy: IfNotPresent
-    # logLevel: info
+    # imagePullPolicy: Always
+    # logLevel: debug
     # Change logTypes to "Conversations", "EndedConversations" or "All" to enable conversation tracking
     # logTypes: Flows
     # Append a unique cluster name to each record
@@ -182,8 +182,8 @@ spec:
   #     timeout: 30s
   consolePlugin:
     enable: true
-    # imagePullPolicy: IfNotPresent
-    # logLevel: info
+    # imagePullPolicy: Always
+    # logLevel: debug
     # Scaling configuration
     # replicas: 1
     # autoscaler:
diff --git a/docs/FlowCollector.md b/docs/FlowCollector.md
@@ -256,23 +256,23 @@ override the default Linux capabilities from there.<br/>
         <td><b>cacheActiveTimeout</b></td>
         <td>string</td>
         <td>
-          `cacheActiveTimeout` is the max period during which the reporter aggregates flows before sending.
+          `cacheActiveTimeout` is the period during which the agent aggregates flows before sending.
 Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
 however you can expect higher memory consumption and an increased latency in the flow collection.<br/>
           <br/>
-            <i>Default</i>: 5s<br/>
+            <i>Default</i>: 15s<br/>
         </td>
         <td>false</td>
       </tr><tr>
         <td><b>cacheMaxFlows</b></td>
         <td>integer</td>
         <td>
-          `cacheMaxFlows` is the max number of flows in an aggregate; when reached, the reporter sends the flows.
+          `cacheMaxFlows` is the maximum number of flows in an aggregate; when reached, the reporter sends the flows.
 Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
 however you can expect higher memory consumption and an increased latency in the flow collection.<br/>
           <br/>
             <i>Format</i>: int32<br/>
-            <i>Default</i>: 100000<br/>
+            <i>Default</i>: 120000<br/>
             <i>Minimum</i>: 1<br/>
         </td>
         <td>false</td>
diff --git a/helm/crds/flows.netobserv.io_flowcollectors.yaml b/helm/crds/flows.netobserv.io_flowcollectors.yaml
@@ -1028,17 +1028,17 @@ spec:
                               type: object
                           type: object
                         cacheActiveTimeout:
-                          default: 5s
+                          default: 15s
                           description: |-
-                            `cacheActiveTimeout` is the max period during which the reporter aggregates flows before sending.
+                            `cacheActiveTimeout` is the period during which the agent aggregates flows before sending.
                             Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
                             however you can expect higher memory consumption and an increased latency in the flow collection.
                           pattern: ^\d+(ns|ms|s|m)?$
                           type: string
                         cacheMaxFlows:
-                          default: 100000
+                          default: 120000
                           description: |-
-                            `cacheMaxFlows` is the max number of flows in an aggregate; when reached, the reporter sends the flows.
+                            `cacheMaxFlows` is the maximum number of flows in an aggregate; when reached, the reporter sends the flows.
                             Increasing `cacheMaxFlows` and `cacheActiveTimeout` can decrease the network traffic overhead and the CPU load,
                             however you can expect higher memory consumption and an increased latency in the flow collection.
                           format: int32
diff --git a/internal/controller/flowcollector_controller_ebpf_test.go b/internal/controller/flowcollector_controller_ebpf_test.go
@@ -57,7 +57,7 @@ func flowCollectorEBPFSpecs() {
 						Type: "eBPF",
 						EBPF: flowslatest.FlowCollectorEBPF{
 							Sampling:           ptr.To(int32(123)),
-							CacheActiveTimeout: "15s",
+							CacheActiveTimeout: "1s",
 							CacheMaxFlows:      100,
 							Interfaces:         []string{"veth0", "/^br-/"},
 							ExcludeInterfaces:  []string{"br-3", "lo"},
@@ -96,7 +96,7 @@ func flowCollectorEBPFSpecs() {
 			Expect(*spec.Containers[0].SecurityContext.RunAsUser).To(Equal(int64(0)))
 			Expect(spec.Containers[0].Env).To(ContainElements(
 				v1.EnvVar{Name: "EXPORT", Value: "grpc"},
-				v1.EnvVar{Name: "CACHE_ACTIVE_TIMEOUT", Value: "15s"},
+				v1.EnvVar{Name: "CACHE_ACTIVE_TIMEOUT", Value: "1s"},
 				v1.EnvVar{Name: "CACHE_MAX_FLOWS", Value: "100"},
 				v1.EnvVar{Name: "LOG_LEVEL", Value: "trace"},
 				v1.EnvVar{Name: "INTERFACES", Value: "veth0,/^br-/"},
diff --git a/internal/controller/flowcollector_controller_iso_test.go b/internal/controller/flowcollector_controller_iso_test.go
@@ -110,7 +110,7 @@ func flowCollectorIsoSpecs() {
 				},
 				EBPF: flowslatest.FlowCollectorEBPF{
 					Sampling:           &zero,
-					CacheActiveTimeout: "5s",
+					CacheActiveTimeout: "15s",
 					CacheMaxFlows:      100,
 					ImagePullPolicy:    "Always",
 					Advanced:           &flowslatest.AdvancedAgentConfig{},