# Patch summary: updates the sre-machine-out-of-compliance PrometheusRule group.
# - MachineOutOfComplianceSRE (critical): age threshold raised from 2419200 s (28 days) to 3024000 s (35 days); `for: 60m` is rewritten as the equivalent `for: 1h`.
# - MachineOutOfComplianceSREWarning (warning, new): fires when more than 5 machines are older than 2419200 s (28 days) for 4h.
# The three hack/00-osd-managed-cluster-config-{integration,production,stage}.yaml.tmpl hunks repeat the same rule change.
# NOTE(review): the hack/ templates look like generated copies of the deploy/ rule - confirm before hand-editing them.
# NOTE(review): the unchanged context comment in the first hunk contains the typo "it's job"; it is left as-is so the hunk context still matches the target file.
diff --git a/deploy/sre-prometheus/management-cluster/100-machine-out-of-compliance.PrometheusRule.yaml b/deploy/sre-prometheus/management-cluster/100-machine-out-of-compliance.PrometheusRule.yaml index 8f9320f1de..9356ebf3ba 100644 --- a/deploy/sre-prometheus/management-cluster/100-machine-out-of-compliance.PrometheusRule.yaml +++ b/deploy/sre-prometheus/management-cluster/100-machine-out-of-compliance.PrometheusRule.yaml @@ -10,15 +10,29 @@ spec: groups: - name: sre-machine-out-of-compliance rules: + # Critical alert: any single machine older than 35 days (3024000 seconds) is treated as a clear failure. - alert: MachineOutOfComplianceSRE # https://issues.redhat.com/browse/OSD-17905 # This alert is a fallback in case the workload in https://issues.redhat.com/browse/OSD-17902 doesn't do it's job. - expr: (time() - mapi_machine_created_timestamp_seconds) > 2419200 - for: 60m + # Fires when ANY machine has been more than 35 days (3024000 seconds) old for 1h, indicating compliance-monkey failed to replace it. + expr: (time() - mapi_machine_created_timestamp_seconds) > 3024000 + for: 1h labels: severity: critical namespace: "{{ $labels.namespace }}" node: "{{ $labels.node }}" link: "https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md" annotations: - message: A machine on a management cluster is older than 28 days. + message: A machine on a management cluster is older than 35 days, indicating a compliance-monkey failure. + # Warning alert for queue backlogs (more than 5 machines aging out simultaneously) + - alert: MachineOutOfComplianceSREWarning + # https://issues.redhat.com/browse/OSD-17905 + # Fires when more than 5 machines have been >28 days (2419200 seconds) old for 4h, indicating a compliance-monkey queue backup. + # This is expected when many machines age out simultaneously but warrants monitoring. 
+ expr: count((time() - mapi_machine_created_timestamp_seconds) > 2419200) > 5 + for: 4h + labels: + severity: warning + link: "https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md" + annotations: + message: "{{ $value }} machines on a management cluster are older than 28 days, indicating a compliance-monkey queue backup." diff --git a/hack/00-osd-managed-cluster-config-integration.yaml.tmpl b/hack/00-osd-managed-cluster-config-integration.yaml.tmpl index 3461e82a4b..5fad1ed146 100644 --- a/hack/00-osd-managed-cluster-config-integration.yaml.tmpl +++ b/hack/00-osd-managed-cluster-config-integration.yaml.tmpl @@ -49076,15 +49076,26 @@ objects: - name: sre-machine-out-of-compliance rules: - alert: MachineOutOfComplianceSRE - expr: (time() - mapi_machine_created_timestamp_seconds) > 2419200 - for: 60m + expr: (time() - mapi_machine_created_timestamp_seconds) > 3024000 + for: 1h labels: severity: critical namespace: '{{ $labels.namespace }}' node: '{{ $labels.node }}' link: https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md annotations: - message: A machine on a management cluster is older than 28 days. + message: A machine on a management cluster is older than 35 days, indicating + a compliance-monkey failure. + - alert: MachineOutOfComplianceSREWarning + expr: count((time() - mapi_machine_created_timestamp_seconds) > 2419200) + > 5 + for: 4h + labels: + severity: warning + link: https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md + annotations: + message: '{{ $value }} machines on a management cluster are older than + 28 days, indicating a compliance-monkey queue backup.' 
- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/hack/00-osd-managed-cluster-config-production.yaml.tmpl b/hack/00-osd-managed-cluster-config-production.yaml.tmpl index 3461e82a4b..5fad1ed146 100644 --- a/hack/00-osd-managed-cluster-config-production.yaml.tmpl +++ b/hack/00-osd-managed-cluster-config-production.yaml.tmpl @@ -49076,15 +49076,26 @@ objects: - name: sre-machine-out-of-compliance rules: - alert: MachineOutOfComplianceSRE - expr: (time() - mapi_machine_created_timestamp_seconds) > 2419200 - for: 60m + expr: (time() - mapi_machine_created_timestamp_seconds) > 3024000 + for: 1h labels: severity: critical namespace: '{{ $labels.namespace }}' node: '{{ $labels.node }}' link: https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md annotations: - message: A machine on a management cluster is older than 28 days. + message: A machine on a management cluster is older than 35 days, indicating + a compliance-monkey failure. + - alert: MachineOutOfComplianceSREWarning + expr: count((time() - mapi_machine_created_timestamp_seconds) > 2419200) + > 5 + for: 4h + labels: + severity: warning + link: https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md + annotations: + message: '{{ $value }} machines on a management cluster are older than + 28 days, indicating a compliance-monkey queue backup.' 
- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/hack/00-osd-managed-cluster-config-stage.yaml.tmpl b/hack/00-osd-managed-cluster-config-stage.yaml.tmpl index 3461e82a4b..5fad1ed146 100644 --- a/hack/00-osd-managed-cluster-config-stage.yaml.tmpl +++ b/hack/00-osd-managed-cluster-config-stage.yaml.tmpl @@ -49076,15 +49076,26 @@ objects: - name: sre-machine-out-of-compliance rules: - alert: MachineOutOfComplianceSRE - expr: (time() - mapi_machine_created_timestamp_seconds) > 2419200 - for: 60m + expr: (time() - mapi_machine_created_timestamp_seconds) > 3024000 + for: 1h labels: severity: critical namespace: '{{ $labels.namespace }}' node: '{{ $labels.node }}' link: https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md annotations: - message: A machine on a management cluster is older than 28 days. + message: A machine on a management cluster is older than 35 days, indicating + a compliance-monkey failure. + - alert: MachineOutOfComplianceSREWarning + expr: count((time() - mapi_machine_created_timestamp_seconds) > 2419200) + > 5 + for: 4h + labels: + severity: warning + link: https://github.com/openshift/ops-sop/blob/master/v4/alerts/hypershift/MachineOutOfCompliance.md + annotations: + message: '{{ $value }} machines on a management cluster are older than + 28 days, indicating a compliance-monkey queue backup.' - apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: