feat: add postgres/gitea/blackbox alerts and more node alerts
This commit is contained in:
19
environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml
Normal file
19
environments/dev/adibrov/alerts/blackbox/endpoint_down.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# Blackbox probe failure alert for the dev/adibrov environment.
name: "DEV ADIBROV - Endpoint Down (Blackbox)"

# probe_success is 1 when the blackbox probe succeeded and 0 when it failed.
expression: |
  probe_success{job="blackbox"}

# Condition: min-reduced value < 1, held for 3 minutes.
threshold: 1
for: "3m"
condition_type: "lt"
need_reduce: true
reducer_type: "min"

# Missing data and query errors are both treated as an outage.
no_data_state: "Alerting"
exec_err_state: "Alerting"

labels:
  service: "blackbox"
  severity: "critical"
  team: "infra"

summary: |
  Endpoint недоступен: {{ $labels.instance }}
description: |
  Blackbox exporter не может достучаться до {{ $labels.instance }}.
  Сервис недоступен снаружи уже более 3 минут.
|
||||
@ -0,0 +1,19 @@
|
||||
# TLS certificate expiry warning based on blackbox probe data.
name: "DEV ADIBROV - SSL Certificate Expiring Soon"

# Expiry timestamp minus current time, converted from seconds to days (86400 s per day).
expression: |
  (probe_ssl_earliest_cert_expiry{job="blackbox"} - time()) / 86400

# Condition: min-reduced days remaining < 14, held for 1 hour.
threshold: 14
for: "1h"
condition_type: "lt"
need_reduce: true
reducer_type: "min"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "blackbox"
  severity: "warning"
  team: "infra"

# NOTE(review): $values.B is presumably the refId of the reduced value - confirm against the rule module.
summary: |
  SSL сертификат истекает через {{ printf "%.0f" $values.B.Value }} дней: {{ $labels.instance }}
description: |
  SSL сертификат для {{ $labels.instance }} истекает менее чем через 14 дней.
  Необходимо обновить сертификат до истечения срока действия.
|
||||
@ -0,0 +1,22 @@
|
||||
# Container memory pressure: working set as a percentage of the configured memory limit.
name: "DEV ADIBROV - Container High Memory Usage"

# The denominator is filtered with "> 0": cAdvisor reports a limit of 0 for
# containers that have no memory limit, and dividing by 0 yields +Inf, which
# would make every unlimited container trip the > 90 condition.
# NOTE(review): the `container` label is normally added by kubelet-embedded
# cAdvisor; confirm that the cAdvisor in this environment exposes it, otherwise
# container!="" filters out every series.
expression: |
  (
    container_memory_working_set_bytes{job="cadvisor", name!="", container!=""}
    / (container_spec_memory_limit_bytes{job="cadvisor", name!="", container!=""} > 0)
  ) * 100

# Condition: max-reduced usage > 90 %, held for 5 minutes.
threshold: 90
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "containers"
  severity: "warning"
  team: "infra"

summary: |
  Контейнер {{ $labels.name }} использует {{ printf "%.0f" $values.B.Value }}% лимита памяти
description: |
  Контейнер {{ $labels.name }} на {{ $labels.instance }} использует {{ printf "%.0f" $values.B.Value }}% от лимита памяти.
  При достижении 100% контейнер будет убит OOM killer.
|
||||
@ -0,0 +1,19 @@
|
||||
# Alert on OOM kill events reported by cAdvisor.
name: "DEV ADIBROV - Container OOM Killed"

# Growth of the OOM event counter over the last 5 minutes, per named container.
expression: |
  increase(container_oom_events_total{job="cadvisor", name!=""}[5m])

# Condition: sum-reduced increase > 0, held for 1 minute.
threshold: 0
for: "1m"
condition_type: "gt"
need_reduce: true
reducer_type: "sum"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "containers"
  severity: "critical"
  team: "infra"

summary: |
  Контейнер {{ $labels.name }} убит OOM killer на {{ $labels.instance }}
description: |
  Контейнер {{ $labels.name }} на {{ $labels.instance }} был убит ядром из-за нехватки памяти.
  Нужно проверить лимиты памяти контейнера и текущее потребление.
|
||||
19
environments/dev/adibrov/alerts/gitea/gitea_down.yaml
Normal file
19
environments/dev/adibrov/alerts/gitea/gitea_down.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# Gitea availability alert driven by the presence of its build-info metric.
name: "DEV ADIBROV - Gitea Down"

expression: |
  gitea_build_info{job="gitea"}

# Condition: min-reduced value < 1, held for 2 minutes.
threshold: 1
for: "2m"
condition_type: "lt"
need_reduce: true
reducer_type: "min"

# A missing metric (failed scrape) or a query error counts as "down".
no_data_state: "Alerting"
exec_err_state: "Alerting"

labels:
  service: "gitea"
  severity: "critical"
  team: "infra"

summary: |
  Gitea недоступна на {{ $labels.instance }}
description: |
  Метрики Gitea не поступают с {{ $labels.instance }}.
  Сервис Gitea либо упал, либо недоступен scrape endpoint.
|
||||
@ -0,0 +1,19 @@
|
||||
# Warns when the number of open Gitea issues grows sharply within one hour.
name: "DEV ADIBROV - Gitea Open Issues Spike"

# gitea_issues_open is a gauge (it goes down when issues are closed), so the
# change over the window is measured with delta(). increase() is meant for
# counters: it treats every decrease as a counter reset and would report
# inflated growth whenever issues get closed.
expression: |
  delta(gitea_issues_open{job="gitea"}[1h])

# Condition: max-reduced growth > 50 issues, held for 5 minutes.
threshold: 50
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "gitea"
  severity: "warning"
  team: "infra"

summary: |
  Всплеск открытых issue в Gitea: +{{ printf "%.0f" $values.B.Value }} за час
description: |
  За последний час количество открытых issues в Gitea выросло на {{ printf "%.0f" $values.B.Value }}.
  Возможна массовая автоматическая генерация задач или проблема с интеграцией.
|
||||
19
environments/dev/adibrov/alerts/nginx/nginx_down.yaml
Normal file
19
environments/dev/adibrov/alerts/nginx/nginx_down.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# Nginx availability alert based on the exporter's nginx_up metric.
name: "DEV ADIBROV - Nginx Down"

expression: |
  nginx_up{job="nginx"}

# Condition: min-reduced value < 1, held for 2 minutes.
threshold: 1
for: "2m"
condition_type: "lt"
need_reduce: true
reducer_type: "min"

# Missing data and query errors are both treated as an outage.
no_data_state: "Alerting"
exec_err_state: "Alerting"

labels:
  service: "nginx"
  severity: "critical"
  team: "infra"

summary: |
  Nginx недоступен на {{ $labels.instance }}
description: |
  Nginx exporter не может подключиться к nginx на {{ $labels.instance }}.
  Nginx либо упал, либо недоступен его status page.
|
||||
@ -0,0 +1,19 @@
|
||||
# Warns about an unusually high number of active nginx connections.
name: "DEV ADIBROV - Nginx High Active Connections"

expression: |
  nginx_connections_active{job="nginx"}

# Condition: max-reduced connection count > 500, held for 5 minutes.
threshold: 500
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "nginx"
  severity: "warning"
  team: "infra"

summary: |
  Много активных соединений nginx: {{ $values.B.Value }} на {{ $labels.instance }}
description: |
  Количество активных соединений nginx на {{ $labels.instance }} превышает 500.
  Возможна высокая нагрузка или атака.
|
||||
19
environments/dev/adibrov/alerts/node/node_disk_io_high.yaml
Normal file
19
environments/dev/adibrov/alerts/node/node_disk_io_high.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# Disk saturation warning: share of time a block device spent doing I/O.
name: "DEV ADIBROV - High Disk IO"

# Per-second rate of I/O-busy seconds, scaled to a percentage; dm-* devices are excluded.
expression: |
  rate(node_disk_io_time_seconds_total{job="node_exporter", device!~"dm-.*"}[5m]) * 100

# Condition: max-reduced busy percentage > 90, held for 10 minutes.
threshold: 90
for: "10m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "system"
  severity: "warning"
  team: "infra"

summary: |
  Высокая нагрузка на диск: {{ printf "%.0f" $values.B.Value }}% IO на {{ $labels.instance }} ({{ $labels.device }})
description: |
  Устройство {{ $labels.device }} на {{ $labels.instance }} занято на {{ printf "%.0f" $values.B.Value }}% в течение 10 минут.
  Высокая нагрузка на IO может замедлять все сервисы на хосте.
|
||||
@ -0,0 +1,22 @@
|
||||
# Filesystem usage warning for ext*/xfs/zfs mounts.
name: "DEV ADIBROV - Disk Space Low"

# Used percentage = (1 - available / size) * 100.
# Mountpoints under /sys, /proc, /dev, /run and /boot are excluded by the regex.
expression: |
  (
    1 - node_filesystem_avail_bytes{job="node_exporter", fstype=~"ext.|xfs|zfs", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*"}
    / node_filesystem_size_bytes{job="node_exporter", fstype=~"ext.|xfs|zfs", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*"}
  ) * 100

# Condition: max-reduced used percentage > 85, held for 5 minutes.
threshold: 85
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "system"
  severity: "warning"
  team: "infra"

summary: |
  Мало места на диске: {{ printf "%.0f" $values.B.Value }}% занято на {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
  На {{ $labels.instance }} точка монтирования {{ $labels.mountpoint }} заполнена на {{ printf "%.0f" $values.B.Value }}%.
  При достижении 100% запись на диск станет невозможной — сервисы начнут падать.
|
||||
21
environments/dev/adibrov/alerts/node/node_high_cpu.yaml
Normal file
21
environments/dev/adibrov/alerts/node/node_high_cpu.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
# Host CPU utilisation warning.
name: "DEV ADIBROV - High CPU Usage"

# Busy percentage = (1 - average idle fraction across CPUs of an instance) * 100.
expression: |
  (
    1 - avg by(instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[5m]))
  ) * 100

# Condition: max-reduced CPU usage > 85 %, held for 5 minutes.
threshold: 85
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "system"
  severity: "warning"
  team: "infra"

summary: |
  Высокая нагрузка CPU: {{ printf "%.0f" $values.B.Value }}% на {{ $labels.instance }}
description: |
  Использование CPU на {{ $labels.instance }} превышает 85% на протяжении 5 минут.
  Это может привести к деградации производительности всех сервисов на хосте.
|
||||
19
environments/dev/adibrov/alerts/node/node_high_load.yaml
Normal file
19
environments/dev/adibrov/alerts/node/node_high_load.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# Host load warning: 5-minute load average normalised by the number of CPU cores.
name: "DEV ADIBROV - High System Load"

# The core count is derived from node_exporter itself (one mode="idle" series
# per CPU), so both sides of the division share the same `instance` label.
# Joining against machine_cpu_cores{job="cadvisor"} only matches when cAdvisor
# and node_exporter targets carry identical instance values (same host:port),
# which is normally not the case; the mismatch yields an empty result that
# no_data_state "OK" would silently hide.
expression: |
  node_load5{job="node_exporter"}
  / on(instance) count by(instance) (node_cpu_seconds_total{job="node_exporter", mode="idle"})

# Condition: max-reduced load per core > 2, held for 10 minutes.
threshold: 2
for: "10m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "system"
  severity: "warning"
  team: "infra"

summary: |
  Высокий LA на {{ $labels.instance }}: {{ printf "%.2f" $values.B.Value }} на ядро
description: |
  Средняя нагрузка (load average 5m) на {{ $labels.instance }} превышает 2x количество ядер CPU.
  Система перегружена — процессы ждут в очереди на выполнение.
|
||||
21
environments/dev/adibrov/alerts/node/node_high_memory.yaml
Normal file
21
environments/dev/adibrov/alerts/node/node_high_memory.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
# Host memory utilisation warning.
name: "DEV ADIBROV - High Memory Usage"

# Used percentage = (1 - MemAvailable / MemTotal) * 100.
expression: |
  (
    1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})
  ) * 100

# Condition: max-reduced memory usage > 90 %, held for 5 minutes.
threshold: 90
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "system"
  severity: "warning"
  team: "infra"

summary: |
  Высокое использование памяти: {{ printf "%.0f" $values.B.Value }}% на {{ $labels.instance }}
description: |
  На {{ $labels.instance }} занято {{ printf "%.0f" $values.B.Value }}% оперативной памяти.
  При исчерпании памяти ядро начнёт убивать процессы (OOM killer).
|
||||
19
environments/dev/adibrov/alerts/node/node_instance_down.yaml
Normal file
19
environments/dev/adibrov/alerts/node/node_instance_down.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# Host availability alert based on the scrape health of node_exporter.
name: "DEV ADIBROV - Node Instance Down"

# `up` is 1 when the last scrape of the target succeeded and 0 otherwise.
expression: |
  up{job="node_exporter"}

# Condition: min-reduced value < 1, held for 2 minutes.
threshold: 1
for: "2m"
condition_type: "lt"
need_reduce: true
reducer_type: "min"

# Missing data and query errors are both treated as an outage.
no_data_state: "Alerting"
exec_err_state: "Alerting"

labels:
  service: "system"
  severity: "critical"
  team: "infra"

summary: |
  Хост недоступен: {{ $labels.instance }}
description: |
  Node exporter на {{ $labels.instance }} не отвечает уже более 2 минут.
  Хост может быть недоступен или упал node_exporter.
|
||||
19
environments/dev/adibrov/alerts/postgres/postgres_down.yaml
Normal file
19
environments/dev/adibrov/alerts/postgres/postgres_down.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# PostgreSQL availability alert based on the exporter's pg_up metric.
name: "DEV ADIBROV - PostgreSQL Down"

expression: |
  pg_up{job="postgres"}

# Condition: min-reduced value < 1, held for 2 minutes.
threshold: 1
for: "2m"
condition_type: "lt"
need_reduce: true
reducer_type: "min"

# Missing data and query errors are both treated as an outage.
no_data_state: "Alerting"
exec_err_state: "Alerting"

labels:
  service: "postgres"
  severity: "critical"
  team: "infra"

summary: |
  PostgreSQL недоступен на {{ $labels.instance }}
description: |
  Exporter не может подключиться к PostgreSQL на {{ $labels.instance }}.
  База данных либо упала, либо недоступна по сети.
|
||||
@ -0,0 +1,21 @@
|
||||
# Warns about long-running active transactions in PostgreSQL.
name: "DEV ADIBROV - PostgreSQL Long Running Transactions"

# Longest active transaction duration per instance (compared against 300 below).
expression: |
  max by(instance) (
    pg_stat_activity_max_tx_duration{job="postgres", state="active"}
  )

# Condition: max-reduced duration > 300, held for 5 minutes.
threshold: 300
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "postgres"
  severity: "warning"
  team: "infra"

summary: |
  Долгая транзакция в PostgreSQL на {{ $labels.instance }}: {{ printf "%.0f" $values.B.Value }}с
description: |
  На {{ $labels.instance }} есть транзакция, выполняющаяся более 5 минут.
  Долгие транзакции блокируют vacuum и могут накапливать bloat.
|
||||
@ -0,0 +1,22 @@
|
||||
# Warns when PostgreSQL connection usage approaches max_connections.
name: "DEV ADIBROV - PostgreSQL Too Many Connections"

# pg_stat_activity_count is exported per database/state (extra labels such as
# datname and state), while pg_settings_max_connections has no such labels.
# A plain division matches on all labels, finds no pairs and returns nothing,
# so the alert could never fire. Connections are therefore summed per instance
# and joined with the limit on the `instance` label only.
expression: |
  (
    sum by(instance) (pg_stat_activity_count{job="postgres"})
    / on(instance) pg_settings_max_connections{job="postgres"}
  ) * 100

# Condition: max-reduced usage > 80 %, held for 5 minutes.
threshold: 80
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "postgres"
  severity: "warning"
  team: "infra"

summary: |
  PostgreSQL: {{ printf "%.0f" $values.B.Value }}% соединений занято на {{ $labels.instance }}
description: |
  На {{ $labels.instance }} занято {{ printf "%.0f" $values.B.Value }}% от max_connections PostgreSQL.
  При достижении лимита новые подключения будут отклоняться с ошибкой.
|
||||
@ -54,7 +54,61 @@ groups = [
|
||||
dashboard_path_if_exist = "dashboards/self-monitoring"
|
||||
keep_manual_changes = false
|
||||
prevent_destroy_on_recreate = false
|
||||
alerts_on_datasources_uid = ["prometheus-local-1"]
|
||||
alerts_on_datasources_uid = ["prometheus"]
|
||||
},
|
||||
{
  # Host-level alert group; the only new group that references a dashboard path.
  dashboard_alert_group_name  = "Node Alerts"
  folder_uid                  = "node"
  alert_definitions_path      = "alerts/node"
  dashboard_path_if_exist     = "dashboards/system"
  keep_manual_changes         = false
  prevent_destroy_on_recreate = false
  alerts_on_datasources_uid   = ["prometheus"]
},
|
||||
{
  # Container alert group; dashboard path is left empty.
  dashboard_alert_group_name  = "Container Alerts"
  folder_uid                  = "containers"
  alert_definitions_path      = "alerts/containers"
  dashboard_path_if_exist     = ""
  keep_manual_changes         = false
  prevent_destroy_on_recreate = false
  alerts_on_datasources_uid   = ["prometheus"]
},
|
||||
{
  # Nginx alert group; dashboard path is left empty.
  dashboard_alert_group_name  = "Nginx Alerts"
  folder_uid                  = "nginx"
  alert_definitions_path      = "alerts/nginx"
  dashboard_path_if_exist     = ""
  keep_manual_changes         = false
  prevent_destroy_on_recreate = false
  alerts_on_datasources_uid   = ["prometheus"]
},
|
||||
{
  # PostgreSQL alert group; dashboard path is left empty.
  dashboard_alert_group_name  = "PostgreSQL Alerts"
  folder_uid                  = "postgres"
  alert_definitions_path      = "alerts/postgres"
  dashboard_path_if_exist     = ""
  keep_manual_changes         = false
  prevent_destroy_on_recreate = false
  alerts_on_datasources_uid   = ["prometheus"]
},
|
||||
{
  # Gitea alert group; dashboard path is left empty.
  dashboard_alert_group_name  = "Gitea Alerts"
  folder_uid                  = "gitea"
  alert_definitions_path      = "alerts/gitea"
  dashboard_path_if_exist     = ""
  keep_manual_changes         = false
  prevent_destroy_on_recreate = false
  alerts_on_datasources_uid   = ["prometheus"]
},
|
||||
{
  # Blackbox alert group (last entry in the list); dashboard path is left empty.
  dashboard_alert_group_name  = "Blackbox Alerts"
  folder_uid                  = "blackbox"
  alert_definitions_path      = "alerts/blackbox"
  dashboard_path_if_exist     = ""
  keep_manual_changes         = false
  prevent_destroy_on_recreate = false
  alerts_on_datasources_uid   = ["prometheus"]
}
|
||||
]
|
||||
# Data sources configuration
|
||||
|
||||
Reference in New Issue
Block a user