From 731d0cfc8e22e7853d8f1bd0ac0b31454791a4c5 Mon Sep 17 00:00:00 2001 From: Philip Potter Date: Wed, 9 Oct 2019 10:09:18 +0100 Subject: [PATCH] Fix TargetDown alert text Currently, when TargetDown fires, we get messages like "0% of the envoy-stats targets are down.", which is obviously wrong: `$value` is the percentage of targets that are up, but the message reports it as the percentage that are down. The fix is to change the alert expression. Rather than averaging `up` to get the percentage of instances that are up, we average `1 - up` to get the percentage of instances that are down, and flip the threshold from `< 90` to `> 10` to match. --- .../charts/gsp-monitoring/templates/rules-general.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/gsp-cluster/charts/gsp-monitoring/templates/rules-general.yaml b/charts/gsp-cluster/charts/gsp-monitoring/templates/rules-general.yaml index dd5717f45..3247bb8e1 100644 --- a/charts/gsp-cluster/charts/gsp-monitoring/templates/rules-general.yaml +++ b/charts/gsp-cluster/charts/gsp-monitoring/templates/rules-general.yaml @@ -9,7 +9,7 @@ spec: - alert: TargetDown annotations: message: '{{`{{ $value }}`}}% of the {{`{{ $labels.job }}`}} targets are down.' - expr: 100 * (avg without(instance, pod) (up)) < 90 + expr: 100 * (avg without(instance, pod) (1 - up)) > 10 for: 10m labels: severity: warning