feat: add postgres/gitea/blackbox alerts and more node alerts
This commit is contained in:
19
environments/dev/adibrov/alerts/postgres/postgres_down.yaml
Normal file
19
environments/dev/adibrov/alerts/postgres/postgres_down.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# Alert rule: PostgreSQL is unreachable from its exporter.
# Watches pg_up for the "postgres" job and alerts when the reduced value
# drops below 1.
name: "DEV ADIBROV - PostgreSQL Down"

# Query evaluated by the rule.
expression: |
  pg_up{job="postgres"}

# Condition: reduced value "lt" threshold, with a pending period of 2m.
threshold: 1
for: "2m"
condition_type: "lt"

# The series are reduced with "min" before the comparison, so the lowest
# pg_up sample in the evaluated window is what gets compared.
need_reduce: true
reducer_type: "min"

# Missing data and evaluation errors are both treated as alerting: if the
# metric is absent, the database state is unknown.
no_data_state: "Alerting"
exec_err_state: "Alerting"

# Labels attached to the alert (presumably used for routing; confirm
# against the notification policy).
labels:
  service: "postgres"
  severity: "critical"
  team: "infra"

# Notification text (templated per alert instance).
summary: |
  PostgreSQL недоступен на {{ $labels.instance }}
description: |
  Exporter не может подключиться к PostgreSQL на {{ $labels.instance }}.
  База данных либо упала, либо недоступна по сети.
|
||||
@ -0,0 +1,21 @@
|
||||
# Alert rule: a PostgreSQL transaction has been open for too long.
# Takes the longest transaction duration per instance and alerts when it
# exceeds the threshold (seconds; the summary prints the value with "с").
name: "DEV ADIBROV - PostgreSQL Long Running Transactions"

# Besides state="active", "idle in transaction" and
# "idle in transaction (aborted)" sessions are included: an open transaction
# that is sitting idle holds back vacuum just like a running one, and with
# the previous state="active" matcher it was never seen by this rule.
# PromQL regex matchers are fully anchored, so plain "idle" is NOT matched.
expression: |
  max by(instance) (
    pg_stat_activity_max_tx_duration{job="postgres", state=~"active|idle in transaction.*"}
  )

# Condition: reduced value "gt" 300, with a pending period of 5m.
threshold: 300
for: "5m"
condition_type: "gt"

# Reduce with "max" so the longest observed duration is compared.
need_reduce: true
reducer_type: "max"

# No data is treated as healthy; evaluation errors surface as Error.
no_data_state: "OK"
exec_err_state: "Error"

# Labels attached to the alert (presumably used for routing; confirm
# against the notification policy).
labels:
  service: "postgres"
  severity: "warning"
  team: "infra"

# Notification text; $values.B.Value is the reduced query value.
summary: |
  Долгая транзакция в PostgreSQL на {{ $labels.instance }}: {{ printf "%.0f" $values.B.Value }}с
description: |
  На {{ $labels.instance }} есть транзакция, выполняющаяся более 5 минут.
  Долгие транзакции блокируют vacuum и могут накапливать bloat.
|
||||
@ -0,0 +1,22 @@
|
||||
# Alert rule: PostgreSQL connection usage is close to max_connections.
# Computes used connections as a percentage of max_connections per
# instance and alerts above the threshold.
name: "DEV ADIBROV - PostgreSQL Too Many Connections"

# pg_stat_activity_count is exported once per database/state (and, in
# postgres_exporter, per user/application), while
# pg_settings_max_connections is a single series per server. Dividing the
# raw vectors matches on ALL labels, finds no pairs and returns nothing,
# which combined with no_data_state "OK" means the alert could never fire.
# Both sides are therefore aggregated to (instance, server) and matched
# explicitly on those labels. "server" is kept so that several servers
# scraped through one exporter are not summed together; if the label is
# absent the grouping degrades to instance only.
expression: |
  (
    sum by (instance, server) (pg_stat_activity_count{job="postgres"})
    / on (instance, server)
    max by (instance, server) (pg_settings_max_connections{job="postgres"})
  ) * 100

# Condition: reduced value "gt" 80 (percent), with a pending period of 5m.
threshold: 80
for: "5m"
condition_type: "gt"

# Reduce with "max" so the peak usage in the window is compared.
need_reduce: true
reducer_type: "max"

# No data is treated as healthy; evaluation errors surface as Error.
no_data_state: "OK"
exec_err_state: "Error"

# Labels attached to the alert (presumably used for routing; confirm
# against the notification policy).
labels:
  service: "postgres"
  severity: "warning"
  team: "infra"

# Notification text; $values.B.Value is the reduced usage percentage.
summary: |
  PostgreSQL: {{ printf "%.0f" $values.B.Value }}% соединений занято на {{ $labels.instance }}
description: |
  На {{ $labels.instance }} занято {{ printf "%.0f" $values.B.Value }}% от max_connections PostgreSQL.
  При достижении лимита новые подключения будут отклоняться с ошибкой.
|
||||
Reference in New Issue
Block a user