feat: add postgres/gitea/blackbox alerts and more node alerts

2026-04-03 11:34:08 +03:00
parent 03dfa99400
commit 5af763ebb1
18 changed files with 393 additions and 1 deletions
--- a/environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml
+++ b/environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Endpoint Down (Blackbox)"
+expression: |
+  probe_success{job="blackbox"}
+threshold: 1  # paired with condition_type "lt": presumably alerts when the reduced value drops below 1 - confirm against the consuming module
+for: "3m"  # matches the "more than 3 minutes" wording in the description below
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # absence of probe data is reported as an alert, not as OK
+exec_err_state: "Alerting"  # query execution errors are reported as an alert as well
+labels:
+  service: "blackbox"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Endpoint недоступен: {{ $labels.instance }}
+description: |
+  Blackbox exporter не может достучаться до {{ $labels.instance }}.
+  Сервис недоступен снаружи уже более 3 минут.
--- a/environments/dev/adibrov/alerts/blackbox/ssl_cert_expiry.yaml
+++ b/environments/dev/adibrov/alerts/blackbox/ssl_cert_expiry.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - SSL Certificate Expiring Soon"
+expression: |
+  (probe_ssl_earliest_cert_expiry{job="blackbox"} - time()) / 86400
+threshold: 14  # in days: the expression divides the remaining seconds by 86400
+for: "1h"
+condition_type: "lt"  # presumably fires when the reduced value is below threshold - confirm against the consuming module
+need_reduce: true  # NOTE(review): the summary reads $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "min"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "blackbox"
+  severity: "warning"
+  team: "infra"
+summary: |
+  SSL сертификат истекает через {{ printf "%.0f" $values.B.Value }} дней: {{ $labels.instance }}
+description: |
+  SSL сертификат для {{ $labels.instance }} истекает менее чем через 14 дней.
+  Необходимо обновить сертификат до истечения срока действия.
--- a/environments/dev/adibrov/alerts/containers/container_high_memory.yaml
+++ b/environments/dev/adibrov/alerts/containers/container_high_memory.yaml
@ -0,0 +1,22 @@
+name: "DEV ADIBROV - Container High Memory Usage"
+expression: |
+  (
+    container_memory_working_set_bytes{job="cadvisor", name!="", container!=""}
+    / (container_spec_memory_limit_bytes{job="cadvisor", name!="", container!=""} > 0)
+  ) * 100
+threshold: 90  # percent of the memory limit; the "> 0" filter in the expression skips containers without a limit (cAdvisor reports 0, which would divide to +Inf)
+for: "5m"
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the templates below read $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "containers"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Контейнер {{ $labels.name }} использует {{ printf "%.0f" $values.B.Value }}% лимита памяти
+description: |
+  Контейнер {{ $labels.name }} на {{ $labels.instance }} использует {{ printf "%.0f" $values.B.Value }}% от лимита памяти.
+  При достижении 100% контейнер будет убит OOM killer.
--- a/environments/dev/adibrov/alerts/containers/container_oom.yaml
+++ b/environments/dev/adibrov/alerts/containers/container_oom.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Container OOM Killed"
+expression: |
+  increase(container_oom_events_total{job="cadvisor", name!=""}[5m])
+threshold: 0  # with condition_type "gt": presumably any increase of the OOM counter within the 5m window triggers - confirm against the consuming module
+for: "1m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "sum"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "containers"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Контейнер {{ $labels.name }} убит OOM killer на {{ $labels.instance }}
+description: |
+  Контейнер {{ $labels.name }} на {{ $labels.instance }} был убит ядром из-за нехватки памяти.
+  Нужно проверить лимиты памяти контейнера и текущее потребление.
--- a/environments/dev/adibrov/alerts/gitea/gitea_down.yaml
+++ b/environments/dev/adibrov/alerts/gitea/gitea_down.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Gitea Down"
+expression: |
+  gitea_build_info{job="gitea"}
+threshold: 1  # paired with condition_type "lt": presumably alerts when the reduced value is below 1 - confirm against the consuming module
+for: "2m"
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # the description frames missing metrics as the failure, so no data must alert
+exec_err_state: "Alerting"
+labels:
+  service: "gitea"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Gitea недоступна на {{ $labels.instance }}
+description: |
+  Метрики Gitea не поступают с {{ $labels.instance }}.
+  Сервис Gitea либо упал, либо недоступен scrape endpoint.
--- a/environments/dev/adibrov/alerts/gitea/gitea_open_issues_spike.yaml
+++ b/environments/dev/adibrov/alerts/gitea/gitea_open_issues_spike.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Gitea Open Issues Spike"
+expression: |
+  delta(gitea_issues_open{job="gitea"}[1h])
+threshold: 50  # net growth of open issues over the 1h window; delta() is used because the metric is a gauge and can decrease
+for: "5m"
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the templates below read $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "gitea"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Всплеск открытых issue в Gitea: +{{ printf "%.0f" $values.B.Value }} за час
+description: |
+  За последний час количество открытых issues в Gitea выросло на {{ printf "%.0f" $values.B.Value }}.
+  Возможна массовая автоматическая генерация задач или проблема с интеграцией.
--- a/environments/dev/adibrov/alerts/nginx/nginx_down.yaml
+++ b/environments/dev/adibrov/alerts/nginx/nginx_down.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Nginx Down"
+expression: |
+  nginx_up{job="nginx"}
+threshold: 1  # paired with condition_type "lt": presumably alerts when the reduced value is below 1 - confirm against the consuming module
+for: "2m"
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # absence of exporter data is reported as an alert, not as OK
+exec_err_state: "Alerting"
+labels:
+  service: "nginx"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Nginx недоступен на {{ $labels.instance }}
+description: |
+  Nginx exporter не может подключиться к nginx на {{ $labels.instance }}.
+  Nginx либо упал, либо недоступен его status page.
--- a/environments/dev/adibrov/alerts/nginx/nginx_high_connections.yaml
+++ b/environments/dev/adibrov/alerts/nginx/nginx_high_connections.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Nginx High Active Connections"
+expression: |
+  nginx_connections_active{job="nginx"}
+threshold: 500  # same limit is quoted in the description text below; keep the two in sync
+for: "5m"
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the summary reads $values.B without printf, unlike sibling alerts - confirm the rendered number format is acceptable
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "nginx"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Много активных соединений nginx: {{ $values.B.Value }} на {{ $labels.instance }}
+description: |
+  Количество активных соединений nginx на {{ $labels.instance }} превышает 500.
+  Возможна высокая нагрузка или атака.
--- a/environments/dev/adibrov/alerts/node/node_disk_io_high.yaml
+++ b/environments/dev/adibrov/alerts/node/node_disk_io_high.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - High Disk IO"
+expression: |
+  rate(node_disk_io_time_seconds_total{job="node_exporter", device!~"dm-.*"}[5m]) * 100
+threshold: 90  # percent: the expression multiplies the per-second IO-time rate by 100
+for: "10m"  # matches the "10 minutes" wording in the description below
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the templates below read $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокая нагрузка на диск: {{ printf "%.0f" $values.B.Value }}% IO на {{ $labels.instance }} ({{ $labels.device }})
+description: |
+  Устройство {{ $labels.device }} на {{ $labels.instance }} занято на {{ printf "%.0f" $values.B.Value }}% в течение 10 минут.
+  Высокая нагрузка на IO может замедлять все сервисы на хосте.
--- a/environments/dev/adibrov/alerts/node/node_disk_space_low.yaml
+++ b/environments/dev/adibrov/alerts/node/node_disk_space_low.yaml
@ -0,0 +1,22 @@
+name: "DEV ADIBROV - Disk Space Low"
+expression: |
+  (
+    1 - node_filesystem_avail_bytes{job="node_exporter", fstype=~"ext.|xfs|zfs", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*"}
+    / node_filesystem_size_bytes{job="node_exporter", fstype=~"ext.|xfs|zfs", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*"}
+  ) * 100
+threshold: 85  # percent used: the expression computes (1 - avail/size) * 100 per filesystem
+for: "5m"
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the templates below read $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Мало места на диске: {{ printf "%.0f" $values.B.Value }}% занято на {{ $labels.mountpoint }} ({{ $labels.instance }})
+description: |
+  На {{ $labels.instance }} точка монтирования {{ $labels.mountpoint }} заполнена на {{ printf "%.0f" $values.B.Value }}%.
+  При достижении 100% запись на диск станет невозможной — сервисы начнут падать.
--- a/environments/dev/adibrov/alerts/node/node_high_cpu.yaml
+++ b/environments/dev/adibrov/alerts/node/node_high_cpu.yaml
@ -0,0 +1,21 @@
+name: "DEV ADIBROV - High CPU Usage"
+expression: |
+  (
+    1 - avg by(instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[5m]))
+  ) * 100
+threshold: 85  # percent busy: 100 minus the per-instance average idle share; the same figure is quoted in the description
+for: "5m"  # matches the "5 minutes" wording in the description below
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the summary reads $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокая нагрузка CPU: {{ printf "%.0f" $values.B.Value }}% на {{ $labels.instance }}
+description: |
+  Использование CPU на {{ $labels.instance }} превышает 85% на протяжении 5 минут.
+  Это может привести к деградации производительности всех сервисов на хосте.
--- a/environments/dev/adibrov/alerts/node/node_high_load.yaml
+++ b/environments/dev/adibrov/alerts/node/node_high_load.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - High System Load"
+expression: |
+  node_load5{job="node_exporter"} / on(instance) count by(instance) (node_cpu_seconds_total{job="node_exporter", mode="idle"})
+threshold: 2  # load per CPU core; cores are counted from node_exporter's own per-CPU series so both sides share the same instance label
+for: "10m"
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the summary reads $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert, so a join that matches nothing would stay silent
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокий LA на {{ $labels.instance }}: {{ printf "%.2f" $values.B.Value }} на ядро
+description: |
+  Средняя нагрузка (load average 5m) на {{ $labels.instance }} превышает 2x количество ядер CPU.
+  Система перегружена — процессы ждут в очереди на выполнение.
--- a/environments/dev/adibrov/alerts/node/node_high_memory.yaml
+++ b/environments/dev/adibrov/alerts/node/node_high_memory.yaml
@ -0,0 +1,21 @@
+name: "DEV ADIBROV - High Memory Usage"
+expression: |
+  (
+    1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})
+  ) * 100
+threshold: 90  # percent used: the expression computes (1 - MemAvailable/MemTotal) * 100
+for: "5m"
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the templates below read $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокое использование памяти: {{ printf "%.0f" $values.B.Value }}% на {{ $labels.instance }}
+description: |
+  На {{ $labels.instance }} занято {{ printf "%.0f" $values.B.Value }}% оперативной памяти.
+  При исчерпании памяти ядро начнёт убивать процессы (OOM killer).
--- a/environments/dev/adibrov/alerts/node/node_instance_down.yaml
+++ b/environments/dev/adibrov/alerts/node/node_instance_down.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Node Instance Down"
+expression: |
+  up{job="node_exporter"}
+threshold: 1  # paired with condition_type "lt": presumably alerts when the reduced value is below 1 - confirm against the consuming module
+for: "2m"  # matches the "more than 2 minutes" wording in the description below
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # absence of scrape data is reported as an alert, not as OK
+exec_err_state: "Alerting"
+labels:
+  service: "system"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Хост недоступен: {{ $labels.instance }}
+description: |
+  Node exporter на {{ $labels.instance }} не отвечает уже более 2 минут.
+  Хост может быть недоступен или упал node_exporter.
--- a/environments/dev/adibrov/alerts/postgres/postgres_down.yaml
+++ b/environments/dev/adibrov/alerts/postgres/postgres_down.yaml
@ -0,0 +1,19 @@
+name: "DEV ADIBROV - PostgreSQL Down"
+expression: |
+  pg_up{job="postgres"}
+threshold: 1  # paired with condition_type "lt": presumably alerts when the reduced value is below 1 - confirm against the consuming module
+for: "2m"
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # absence of exporter data is reported as an alert, not as OK
+exec_err_state: "Alerting"
+labels:
+  service: "postgres"
+  severity: "critical"
+  team: "infra"
+summary: |
+  PostgreSQL недоступен на {{ $labels.instance }}
+description: |
+  Exporter не может подключиться к PostgreSQL на {{ $labels.instance }}.
+  База данных либо упала, либо недоступна по сети.
--- a/environments/dev/adibrov/alerts/postgres/postgres_long_transactions.yaml
+++ b/environments/dev/adibrov/alerts/postgres/postgres_long_transactions.yaml
@ -0,0 +1,21 @@
+name: "DEV ADIBROV - PostgreSQL Long Running Transactions"
+expression: |
+  max by(instance) (
+    pg_stat_activity_max_tx_duration{job="postgres", state="active"}
+  )
+threshold: 300  # presumably seconds - the description below words it as "more than 5 minutes"; confirm the metric unit
+for: "5m"  # NOTE(review): applied on top of the 300 threshold, so notification presumably lags the transaction start by about 10 minutes - confirm intent
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the summary reads $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert
+exec_err_state: "Error"
+labels:
+  service: "postgres"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Долгая транзакция в PostgreSQL на {{ $labels.instance }}: {{ printf "%.0f" $values.B.Value }}с
+description: |
+  На {{ $labels.instance }} есть транзакция, выполняющаяся более 5 минут.
+  Долгие транзакции блокируют vacuum и могут накапливать bloat.
--- a/environments/dev/adibrov/alerts/postgres/postgres_too_many_connections.yaml
+++ b/environments/dev/adibrov/alerts/postgres/postgres_too_many_connections.yaml
@ -0,0 +1,22 @@
+name: "DEV ADIBROV - PostgreSQL Too Many Connections"
+expression: |
+  (
+    sum by(instance) (pg_stat_activity_count{job="postgres"})
+    / max by(instance) (pg_settings_max_connections{job="postgres"})
+  ) * 100
+threshold: 80  # percent of max_connections in use; both sides are aggregated by instance so their label sets match
+for: "5m"
+condition_type: "gt"
+need_reduce: true  # NOTE(review): the templates below read $values.B - presumably the refId of the reduce step; confirm
+reducer_type: "max"
+no_data_state: "OK"  # missing data does not raise this alert, so an expression that matches nothing would stay silent
+exec_err_state: "Error"
+labels:
+  service: "postgres"
+  severity: "warning"
+  team: "infra"
+summary: |
+  PostgreSQL: {{ printf "%.0f" $values.B.Value }}% соединений занято на {{ $labels.instance }}
+description: |
+  На {{ $labels.instance }} занято {{ printf "%.0f" $values.B.Value }}% от max_connections PostgreSQL.
+  При достижении лимита новые подключения будут отклоняться с ошибкой.
--- a/environments/dev/adibrov/terraform.tfvars
+++ b/environments/dev/adibrov/terraform.tfvars
@ -54,7 +54,61 @@ groups = [
    dashboard_path_if_exist     = "dashboards/self-monitoring"
    keep_manual_changes         = false
    prevent_destroy_on_recreate = false
-    alerts_on_datasources_uid   = ["prometheus-local-1"]
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Node Alerts"
+    folder_uid                  = "node"
+    alert_definitions_path      = "alerts/node"
+    dashboard_path_if_exist     = "dashboards/system"
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Container Alerts"
+    folder_uid                  = "containers"
+    alert_definitions_path      = "alerts/containers"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Nginx Alerts"
+    folder_uid                  = "nginx"
+    alert_definitions_path      = "alerts/nginx"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "PostgreSQL Alerts"
+    folder_uid                  = "postgres"
+    alert_definitions_path      = "alerts/postgres"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Gitea Alerts"
+    folder_uid                  = "gitea"
+    alert_definitions_path      = "alerts/gitea"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Blackbox Alerts"
+    folder_uid                  = "blackbox"
+    alert_definitions_path      = "alerts/blackbox"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
  }
 ]
 # Data sources configuration