From 5af763ebb145ebcd1cb3b0095b5ab84cbdb7ff9f Mon Sep 17 00:00:00 2001 From: Alexandr Date: Fri, 3 Apr 2026 11:34:08 +0300 Subject: [PATCH] feat: add postgres/gitea/blackbox alerts and more node alerts --- .../alerts/blackbox/endpoint_down.yaml | 19 +++++++ .../alerts/blackbox/ssl_cert_expiry.yaml | 19 +++++++ .../containers/container_high_memory.yaml | 22 ++++++++ .../alerts/containers/container_oom.yaml | 19 +++++++ .../dev/adibrov/alerts/gitea/gitea_down.yaml | 19 +++++++ .../alerts/gitea/gitea_open_issues_spike.yaml | 19 +++++++ .../dev/adibrov/alerts/nginx/nginx_down.yaml | 19 +++++++ .../alerts/nginx/nginx_high_connections.yaml | 19 +++++++ .../alerts/node/node_disk_io_high.yaml | 19 +++++++ .../alerts/node/node_disk_space_low.yaml | 22 ++++++++ .../adibrov/alerts/node/node_high_cpu.yaml | 21 +++++++ .../adibrov/alerts/node/node_high_load.yaml | 19 +++++++ .../adibrov/alerts/node/node_high_memory.yaml | 21 +++++++ .../alerts/node/node_instance_down.yaml | 19 +++++++ .../alerts/postgres/postgres_down.yaml | 19 +++++++ .../postgres/postgres_long_transactions.yaml | 21 +++++++ .../postgres_too_many_connections.yaml | 22 ++++++++ environments/dev/adibrov/terraform.tfvars | 56 ++++++++++++++++++- 18 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml create mode 100644 environments/dev/adibrov/alerts/blackbox/ssl_cert_expiry.yaml create mode 100644 environments/dev/adibrov/alerts/containers/container_high_memory.yaml create mode 100644 environments/dev/adibrov/alerts/containers/container_oom.yaml create mode 100644 environments/dev/adibrov/alerts/gitea/gitea_down.yaml create mode 100644 environments/dev/adibrov/alerts/gitea/gitea_open_issues_spike.yaml create mode 100644 environments/dev/adibrov/alerts/nginx/nginx_down.yaml create mode 100644 environments/dev/adibrov/alerts/nginx/nginx_high_connections.yaml create mode 100644 environments/dev/adibrov/alerts/node/node_disk_io_high.yaml 
create mode 100644 environments/dev/adibrov/alerts/node/node_disk_space_low.yaml
 create mode 100644 environments/dev/adibrov/alerts/node/node_high_cpu.yaml
 create mode 100644 environments/dev/adibrov/alerts/node/node_high_load.yaml
 create mode 100644 environments/dev/adibrov/alerts/node/node_high_memory.yaml
 create mode 100644 environments/dev/adibrov/alerts/node/node_instance_down.yaml
 create mode 100644 environments/dev/adibrov/alerts/postgres/postgres_down.yaml
 create mode 100644 environments/dev/adibrov/alerts/postgres/postgres_long_transactions.yaml
 create mode 100644 environments/dev/adibrov/alerts/postgres/postgres_too_many_connections.yaml

diff --git a/environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml b/environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml
new file mode 100644
index 0000000..d63e8a0
--- /dev/null
+++ b/environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Endpoint Down (Blackbox)"
+expression: |  # blackbox_exporter reports probe_success as 1 for a successful probe and 0 for a failed one
+  probe_success{job="blackbox"}
+threshold: 1
+for: "3m"
+condition_type: "lt"  # together with threshold 1: presumably fires when the reduced value is below 1 (module semantics are not shown here)
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # missing data is reported as Alerting rather than OK
+exec_err_state: "Alerting"
+labels:
+  service: "blackbox"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Endpoint недоступен: {{ $labels.instance }}
+description: |
+  Blackbox exporter не может достучаться до {{ $labels.instance }}.
+  Сервис недоступен снаружи уже более 3 минут.
diff --git a/environments/dev/adibrov/alerts/blackbox/ssl_cert_expiry.yaml b/environments/dev/adibrov/alerts/blackbox/ssl_cert_expiry.yaml
new file mode 100644
index 0000000..62b1d31
--- /dev/null
+++ b/environments/dev/adibrov/alerts/blackbox/ssl_cert_expiry.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - SSL Certificate Expiring Soon"
+expression: |  # seconds until the earliest certificate expiry, divided by 86400 to express the result in days
+  (probe_ssl_earliest_cert_expiry{job="blackbox"} - time()) / 86400
+threshold: 14  # days, same unit as the expression result
+for: "1h"
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "blackbox"
+  severity: "warning"
+  team: "infra"
+summary: |  # $values.B is presumably the refId of the reduce step created by the consuming module — TODO confirm
+  SSL сертификат истекает через {{ printf "%.0f" $values.B.Value }} дней: {{ $labels.instance }}
+description: |
+  SSL сертификат для {{ $labels.instance }} истекает менее чем через 14 дней.
+  Необходимо обновить сертификат до истечения срока действия.
diff --git a/environments/dev/adibrov/alerts/containers/container_high_memory.yaml b/environments/dev/adibrov/alerts/containers/container_high_memory.yaml
new file mode 100644
index 0000000..e3f5f09
--- /dev/null
+++ b/environments/dev/adibrov/alerts/containers/container_high_memory.yaml
@@ -0,0 +1,22 @@
+name: "DEV ADIBROV - Container High Memory Usage"
+expression: |  # working set as % of the memory limit; "> 0" drops containers without a limit (a 0 limit would divide to +Inf and always fire); selector uses name!="" only, like container_oom.yaml
+  (
+    container_memory_working_set_bytes{job="cadvisor", name!=""}
+    / (container_spec_memory_limit_bytes{job="cadvisor", name!=""} > 0)
+  ) * 100
+threshold: 90  # percent of the container memory limit
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"  # no series is expected when no container has a memory limit configured
+exec_err_state: "Error"
+labels:
+  service: "containers"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Контейнер {{ $labels.name }} использует {{ printf "%.0f" $values.B.Value }}% лимита памяти
+description: |
+  Контейнер {{ $labels.name }} на {{ $labels.instance }} использует {{ printf "%.0f" $values.B.Value }}% от лимита памяти.
+  При достижении 100% контейнер будет убит OOM killer.
diff --git a/environments/dev/adibrov/alerts/containers/container_oom.yaml b/environments/dev/adibrov/alerts/containers/container_oom.yaml
new file mode 100644
index 0000000..d9a71a9
--- /dev/null
+++ b/environments/dev/adibrov/alerts/containers/container_oom.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Container OOM Killed"
+expression: |  # growth of the OOM event counter over the last 5 minutes, per named container
+  increase(container_oom_events_total{job="cadvisor", name!=""}[5m])
+threshold: 0
+for: "1m"
+condition_type: "gt"  # together with threshold 0: any OOM event inside the 5m window
+need_reduce: true
+reducer_type: "sum"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "containers"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Контейнер {{ $labels.name }} убит OOM killer на {{ $labels.instance }}
+description: |
+  Контейнер {{ $labels.name }} на {{ $labels.instance }} был убит ядром из-за нехватки памяти.
+  Нужно проверить лимиты памяти контейнера и текущее потребление.
diff --git a/environments/dev/adibrov/alerts/gitea/gitea_down.yaml b/environments/dev/adibrov/alerts/gitea/gitea_down.yaml
new file mode 100644
index 0000000..785e32d
--- /dev/null
+++ b/environments/dev/adibrov/alerts/gitea/gitea_down.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Gitea Down"
+expression: |  # NOTE(review): relies on gitea_build_info being exported with value 1 — confirm for the deployed Gitea version
+  gitea_build_info{job="gitea"}
+threshold: 1
+for: "2m"
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # a missing metric is reported as Alerting; per the description, absent metrics are the "down" signal
+exec_err_state: "Alerting"
+labels:
+  service: "gitea"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Gitea недоступна на {{ $labels.instance }}
+description: |
+  Метрики Gitea не поступают с {{ $labels.instance }}.
+  Сервис Gitea либо упал, либо недоступен scrape endpoint.
diff --git a/environments/dev/adibrov/alerts/gitea/gitea_open_issues_spike.yaml b/environments/dev/adibrov/alerts/gitea/gitea_open_issues_spike.yaml
new file mode 100644
index 0000000..23dbf86
--- /dev/null
+++ b/environments/dev/adibrov/alerts/gitea/gitea_open_issues_spike.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Gitea Open Issues Spike"
+expression: |  # gitea_issues_open is a gauge, so delta() is used; increase() treats every decrease as a counter reset and reports a false spike
+  delta(gitea_issues_open{job="gitea"}[1h])
+threshold: 50  # net growth in open issues over the 1h window
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "gitea"
+  severity: "warning"
+  team: "infra"
+summary: |  # the "+" in front of the template is literal text shown before the number
+  Всплеск открытых issue в Gitea: +{{ printf "%.0f" $values.B.Value }} за час
+description: |
+  За последний час количество открытых issues в Gitea выросло на {{ printf "%.0f" $values.B.Value }}.
+  Возможна массовая автоматическая генерация задач или проблема с интеграцией.
diff --git a/environments/dev/adibrov/alerts/nginx/nginx_down.yaml b/environments/dev/adibrov/alerts/nginx/nginx_down.yaml
new file mode 100644
index 0000000..d51ba9f
--- /dev/null
+++ b/environments/dev/adibrov/alerts/nginx/nginx_down.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Nginx Down"
+expression: |  # nginx_up comes from the nginx exporter; presumably 1 while it can read the nginx status page — TODO confirm
+  nginx_up{job="nginx"}
+threshold: 1
+for: "2m"
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # missing data is reported as Alerting rather than OK
+exec_err_state: "Alerting"
+labels:
+  service: "nginx"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Nginx недоступен на {{ $labels.instance }}
+description: |
+  Nginx exporter не может подключиться к nginx на {{ $labels.instance }}.
+  Nginx либо упал, либо недоступен его status page.
diff --git a/environments/dev/adibrov/alerts/nginx/nginx_high_connections.yaml b/environments/dev/adibrov/alerts/nginx/nginx_high_connections.yaml
new file mode 100644
index 0000000..49bae52
--- /dev/null
+++ b/environments/dev/adibrov/alerts/nginx/nginx_high_connections.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Nginx High Active Connections"
+expression: |  # current number of active client connections reported by the nginx exporter
+  nginx_connections_active{job="nginx"}
+threshold: 500  # connections; the description text repeats this number
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "nginx"
+  severity: "warning"
+  team: "infra"
+summary: |  # value is formatted with printf "%.0f" like the other alert templates in this directory tree
+  Много активных соединений nginx: {{ printf "%.0f" $values.B.Value }} на {{ $labels.instance }}
+description: |
+  Количество активных соединений nginx на {{ $labels.instance }} превышает 500.
+  Возможна высокая нагрузка или атака.
diff --git a/environments/dev/adibrov/alerts/node/node_disk_io_high.yaml b/environments/dev/adibrov/alerts/node/node_disk_io_high.yaml
new file mode 100644
index 0000000..a32ed50
--- /dev/null
+++ b/environments/dev/adibrov/alerts/node/node_disk_io_high.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - High Disk IO"
+expression: |  # per-second rate of device busy time over 5m, times 100 to get percent; dm-* devices are excluded
+  rate(node_disk_io_time_seconds_total{job="node_exporter", device!~"dm-.*"}[5m]) * 100
+threshold: 90  # percent busy
+for: "10m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокая нагрузка на диск: {{ printf "%.0f" $values.B.Value }}% IO на {{ $labels.instance }} ({{ $labels.device }})
+description: |
+  Устройство {{ $labels.device }} на {{ $labels.instance }} занято на {{ printf "%.0f" $values.B.Value }}% в течение 10 минут.
+  Высокая нагрузка на IO может замедлять все сервисы на хосте.
diff --git a/environments/dev/adibrov/alerts/node/node_disk_space_low.yaml b/environments/dev/adibrov/alerts/node/node_disk_space_low.yaml
new file mode 100644
index 0000000..13cd78f
--- /dev/null
+++ b/environments/dev/adibrov/alerts/node/node_disk_space_low.yaml
@@ -0,0 +1,22 @@
+name: "DEV ADIBROV - Disk Space Low"
+expression: |  # used space in percent: (1 - avail/size) * 100; only ext*/xfs/zfs, with /sys,/proc,/dev,/run,/boot mountpoints excluded
+  (
+    1 - node_filesystem_avail_bytes{job="node_exporter", fstype=~"ext.|xfs|zfs", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*"}
+    / node_filesystem_size_bytes{job="node_exporter", fstype=~"ext.|xfs|zfs", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*"}
+  ) * 100
+threshold: 85  # percent used
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Мало места на диске: {{ printf "%.0f" $values.B.Value }}% занято на {{ $labels.mountpoint }} ({{ $labels.instance }})
+description: |
+  На {{ $labels.instance }} точка монтирования {{ $labels.mountpoint }} заполнена на {{ printf "%.0f" $values.B.Value }}%.
+  При достижении 100% запись на диск станет невозможной — сервисы начнут падать.
diff --git a/environments/dev/adibrov/alerts/node/node_high_cpu.yaml b/environments/dev/adibrov/alerts/node/node_high_cpu.yaml
new file mode 100644
index 0000000..433d15a
--- /dev/null
+++ b/environments/dev/adibrov/alerts/node/node_high_cpu.yaml
@@ -0,0 +1,21 @@
+name: "DEV ADIBROV - High CPU Usage"
+expression: |  # 100 * (1 - average idle fraction across all CPUs of an instance over 5m)
+  (
+    1 - avg by(instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[5m]))
+  ) * 100
+threshold: 85  # percent busy; the description text repeats this number
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокая нагрузка CPU: {{ printf "%.0f" $values.B.Value }}% на {{ $labels.instance }}
+description: |
+  Использование CPU на {{ $labels.instance }} превышает 85% на протяжении 5 минут.
+  Это может привести к деградации производительности всех сервисов на хосте.
diff --git a/environments/dev/adibrov/alerts/node/node_high_load.yaml b/environments/dev/adibrov/alerts/node/node_high_load.yaml
new file mode 100644
index 0000000..97078a6
--- /dev/null
+++ b/environments/dev/adibrov/alerts/node/node_high_load.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - High System Load"
+expression: |  # load5 per CPU core; cores are counted from node_exporter's own idle series so both sides share the same instance label (a cadvisor-job metric usually has a different instance value and the join would be empty)
+  node_load5{job="node_exporter"} / on(instance) count by(instance) (node_cpu_seconds_total{job="node_exporter", mode="idle"})
+threshold: 2  # load per core; the description calls this "2x the number of CPU cores"
+for: "10m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокий LA на {{ $labels.instance }}: {{ printf "%.2f" $values.B.Value }} на ядро
+description: |
+  Средняя нагрузка (load average 5m) на {{ $labels.instance }} превышает 2x количество ядер CPU.
+  Система перегружена — процессы ждут в очереди на выполнение.
diff --git a/environments/dev/adibrov/alerts/node/node_high_memory.yaml b/environments/dev/adibrov/alerts/node/node_high_memory.yaml
new file mode 100644
index 0000000..38cf199
--- /dev/null
+++ b/environments/dev/adibrov/alerts/node/node_high_memory.yaml
@@ -0,0 +1,21 @@
+name: "DEV ADIBROV - High Memory Usage"
+expression: |  # used memory in percent: (1 - MemAvailable/MemTotal) * 100
+  (
+    1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})
+  ) * 100
+threshold: 90  # percent used
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Высокое использование памяти: {{ printf "%.0f" $values.B.Value }}% на {{ $labels.instance }}
+description: |
+  На {{ $labels.instance }} занято {{ printf "%.0f" $values.B.Value }}% оперативной памяти.
+  При исчерпании памяти ядро начнёт убивать процессы (OOM killer).
diff --git a/environments/dev/adibrov/alerts/node/node_instance_down.yaml b/environments/dev/adibrov/alerts/node/node_instance_down.yaml
new file mode 100644
index 0000000..cc0c807
--- /dev/null
+++ b/environments/dev/adibrov/alerts/node/node_instance_down.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - Node Instance Down"
+expression: |  # Prometheus sets up to 1 when the last scrape of the target succeeded and to 0 when it failed
+  up{job="node_exporter"}
+threshold: 1
+for: "2m"
+condition_type: "lt"  # together with threshold 1: presumably fires when the reduced value is below 1 (module semantics are not shown here)
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # missing data is reported as Alerting rather than OK
+exec_err_state: "Alerting"
+labels:
+  service: "system"
+  severity: "critical"
+  team: "infra"
+summary: |
+  Хост недоступен: {{ $labels.instance }}
+description: |
+  Node exporter на {{ $labels.instance }} не отвечает уже более 2 минут.
+  Хост может быть недоступен или упал node_exporter.
diff --git a/environments/dev/adibrov/alerts/postgres/postgres_down.yaml b/environments/dev/adibrov/alerts/postgres/postgres_down.yaml
new file mode 100644
index 0000000..e4acee2
--- /dev/null
+++ b/environments/dev/adibrov/alerts/postgres/postgres_down.yaml
@@ -0,0 +1,19 @@
+name: "DEV ADIBROV - PostgreSQL Down"
+expression: |  # pg_up comes from the postgres exporter; presumably 1 while it can connect to the database — TODO confirm
+  pg_up{job="postgres"}
+threshold: 1
+for: "2m"
+condition_type: "lt"
+need_reduce: true
+reducer_type: "min"
+no_data_state: "Alerting"  # missing data is reported as Alerting rather than OK
+exec_err_state: "Alerting"
+labels:
+  service: "postgres"
+  severity: "critical"
+  team: "infra"
+summary: |
+  PostgreSQL недоступен на {{ $labels.instance }}
+description: |
+  Exporter не может подключиться к PostgreSQL на {{ $labels.instance }}.
+  База данных либо упала, либо недоступна по сети.
diff --git a/environments/dev/adibrov/alerts/postgres/postgres_long_transactions.yaml b/environments/dev/adibrov/alerts/postgres/postgres_long_transactions.yaml
new file mode 100644
index 0000000..9e4443f
--- /dev/null
+++ b/environments/dev/adibrov/alerts/postgres/postgres_long_transactions.yaml
@@ -0,0 +1,21 @@
+name: "DEV ADIBROV - PostgreSQL Long Running Transactions"
+expression: |  # longest transaction duration among state="active" series, collapsed to one value per instance
+  max by(instance) (
+    pg_stat_activity_max_tx_duration{job="postgres", state="active"}
+  )
+threshold: 300  # presumably seconds (the summary prints the value with an "s" suffix) — i.e. 5 minutes
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "postgres"
+  severity: "warning"
+  team: "infra"
+summary: |
+  Долгая транзакция в PostgreSQL на {{ $labels.instance }}: {{ printf "%.0f" $values.B.Value }}с
+description: |
+  На {{ $labels.instance }} есть транзакция, выполняющаяся более 5 минут.
+  Долгие транзакции блокируют vacuum и могут накапливать bloat.
diff --git a/environments/dev/adibrov/alerts/postgres/postgres_too_many_connections.yaml b/environments/dev/adibrov/alerts/postgres/postgres_too_many_connections.yaml
new file mode 100644
index 0000000..3c0fa01
--- /dev/null
+++ b/environments/dev/adibrov/alerts/postgres/postgres_too_many_connections.yaml
@@ -0,0 +1,22 @@
+name: "DEV ADIBROV - PostgreSQL Too Many Connections"
+expression: |  # both sides are aggregated to one series per instance first: pg_stat_activity_count has per-database/per-state labels that max_connections lacks, so a plain "/" would match nothing
+  (
+    sum by(instance) (pg_stat_activity_count{job="postgres"})
+    / max by(instance) (pg_settings_max_connections{job="postgres"})
+  ) * 100
+threshold: 80  # percent of max_connections in use
+for: "5m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "postgres"
+  severity: "warning"
+  team: "infra"
+summary: |
+  PostgreSQL: {{ printf "%.0f" $values.B.Value }}% соединений занято на {{ $labels.instance }}
+description: |
+  На {{ $labels.instance }} занято {{ printf "%.0f" $values.B.Value }}% от max_connections PostgreSQL.
+  При достижении лимита новые подключения будут отклоняться с ошибкой.
diff --git a/environments/dev/adibrov/terraform.tfvars b/environments/dev/adibrov/terraform.tfvars
index e3df0fb..14e86fe 100644
--- a/environments/dev/adibrov/terraform.tfvars
+++ b/environments/dev/adibrov/terraform.tfvars
@@ -54,7 +54,61 @@ groups = [
     dashboard_path_if_exist     = "dashboards/self-monitoring"
     keep_manual_changes         = false
     prevent_destroy_on_recreate = false
-    alerts_on_datasources_uid   = ["prometheus-local-1"]
+    alerts_on_datasources_uid   = ["prometheus"] # NOTE(review): presumably must equal the UID of a configured Prometheus data source — confirm
+  },
+  {
+    dashboard_alert_group_name  = "Node Alerts"
+    folder_uid                  = "node"
+    alert_definitions_path      = "alerts/node" # directory holding the node_*.yaml alert definitions
+    dashboard_path_if_exist     = "dashboards/system"
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Container Alerts"
+    folder_uid                  = "containers"
+    alert_definitions_path      = "alerts/containers"
+    dashboard_path_if_exist     = "" # presumably means "no dashboards for this group" — TODO confirm against the module
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Nginx Alerts"
+    folder_uid                  = "nginx"
+    alert_definitions_path      = "alerts/nginx"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "PostgreSQL Alerts"
+    folder_uid                  = "postgres"
+    alert_definitions_path      = "alerts/postgres"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Gitea Alerts"
+    folder_uid                  = "gitea"
+    alert_definitions_path      = "alerts/gitea"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
+  },
+  {
+    dashboard_alert_group_name  = "Blackbox Alerts"
+    folder_uid                  = "blackbox"
+    alert_definitions_path      = "alerts/blackbox"
+    dashboard_path_if_exist     = ""
+    keep_manual_changes         = false
+    prevent_destroy_on_recreate = false
+    alerts_on_datasources_uid   = ["prometheus"]
   }
 ]
 # Data sources configuration