diff --git a/environments/dev/adibrov/alerts/test/test_cpu_firing.yaml b/environments/dev/adibrov/alerts/test/test_cpu_firing.yaml
new file mode 100644
index 0000000..e60df86
--- /dev/null
+++ b/environments/dev/adibrov/alerts/test/test_cpu_firing.yaml
@@ -0,0 +1,28 @@
+# Test alert: fires when per-instance CPU utilisation stays above 2%.
+# NOTE(review): rate() needs at least two samples inside the [1m] window;
+# confirm the node_exporter scrape interval is 30s or less. With
+# no_data_state "OK" an empty result would presumably never alert.
+name: "DEV ADIBROV - TEST CPU > 2%"
+# Busy CPU percentage: 100 * (1 - mean idle rate per instance).
+expression: |
+  (
+    1 - avg by(instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[1m]))
+  ) * 100
+threshold: 2
+for: "1m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "OK"
+labels:
+  service: "test"
+  severity: "info"
+  status: "test"
+  team: "infra"
+# "|-" keeps the block scalar's trailing newline out of the annotation.
+# $values.B is presumably the reduce step's refId; verify against the module.
+summary: |-
+  [TEST] CPU {{ printf "%.1f" $values.B.Value }}% на {{ $labels.instance }}
+description: |-
+  Тестовый алерт — срабатывает при CPU > 2%. Используется для проверки доставки уведомлений.
diff --git a/environments/dev/adibrov/alerts/test/test_goroutines_spike.yaml b/environments/dev/adibrov/alerts/test/test_goroutines_spike.yaml
new file mode 100644
index 0000000..e136f89
--- /dev/null
+++ b/environments/dev/adibrov/alerts/test/test_goroutines_spike.yaml
@@ -0,0 +1,25 @@
+# Test alert on the goroutine count reported by the "prometheus" job.
+# NOTE(review): the description says this alert always fires yet is used to
+# verify resolve notifications; confirm how the resolve is meant to be
+# triggered (e.g. by temporarily raising the threshold).
+name: "DEV ADIBROV - TEST Go Goroutines"
+expression: |
+  go_goroutines{job="prometheus"}
+threshold: 10
+for: "1m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "OK"
+labels:
+  service: "test"
+  severity: "info"
+  status: "test"
+  team: "infra"
+# "|-" keeps the block scalar's trailing newline out of the annotation.
+# $values.B is presumably the reduce step's refId; verify against the module.
+summary: |-
+  [TEST] Goroutines prometheus: {{ printf "%.0f" $values.B.Value }}
+description: |-
+  Тестовый алерт — горутин в prometheus больше 10 (норма ~100+). Всегда файрится, используется для проверки resolve.
diff --git a/environments/dev/adibrov/alerts/test/test_memory_firing.yaml b/environments/dev/adibrov/alerts/test/test_memory_firing.yaml
new file mode 100644
index 0000000..b16d34a
--- /dev/null
+++ b/environments/dev/adibrov/alerts/test/test_memory_firing.yaml
@@ -0,0 +1,25 @@
+# Test alert: fires when memory usage stays above 20%.
+name: "DEV ADIBROV - TEST Memory > 20%"
+# Used memory percentage: 100 * (1 - MemAvailable / MemTotal).
+expression: |
+  (
+    1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})
+  ) * 100
+threshold: 20
+for: "1m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "OK"
+labels:
+  service: "test"
+  severity: "info"
+  status: "test"
+  team: "infra"
+# "|-" keeps the block scalar's trailing newline out of the annotation.
+# $values.B is presumably the reduce step's refId; verify against the module.
+summary: |-
+  [TEST] Память {{ printf "%.1f" $values.B.Value }}% на {{ $labels.instance }}
+description: |-
+  Тестовый алерт — срабатывает при использовании памяти > 20%. Используется для проверки доставки уведомлений.
diff --git a/environments/dev/adibrov/terraform.tfvars b/environments/dev/adibrov/terraform.tfvars
index 185edca..08fe7cd 100644
--- a/environments/dev/adibrov/terraform.tfvars
+++ b/environments/dev/adibrov/terraform.tfvars
@@ -109,6 +109,15 @@ groups = [
     keep_manual_changes = false
     prevent_destroy_on_recreate = false
     alerts_on_datasources_uid = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name = "Test Alerts"
+    folder_uid = "test-alerts"
+    alert_definitions_path = "alerts/test"
+    dashboard_path_if_exist = null
+    keep_manual_changes = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid = ["prometheus"]
   }
 ]
 # Data sources configuration