Add dashboard UID auto-generation and Gitea CI workflow
Some checks failed
terraform-dev / validate (push) Failing after 1m53s
terraform-dev / plan (push) Has been skipped
terraform-dev / apply (push) Has been skipped

This commit is contained in:
Alexandr
2026-03-25 06:41:19 +03:00
parent 345c5786b3
commit 558a23d916
83 changed files with 53372 additions and 1 deletions

View File

@ -0,0 +1,20 @@
# Alert rule: vmagent is dropping data from its on-disk persistent queue.
# The query sums the 5-minute increase of vm_persistentqueue_bytes_dropped_total
# for jobs whose name contains "agent", aggregating away the `path` label,
# and keeps only results above zero.
name: "Vmagent Persistent Queue Is Dropping Data"

labels:
  service: "vmagent"
  severity: "critical"
  status: "test"

expression: |
  sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0

# Evaluation settings.
# NOTE(review): field semantics are defined by whatever consumes this file
# (presumably the Terraform alert module) — confirm there.
condition_type: "gt"
threshold: 0
for: "10m"
need_reduce: true
reducer_type: "sum"

# Unlike the sibling rules, an execution error keeps the previous state.
no_data_state: "OK"
exec_err_state: "KeepLast"

summary: |
  Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
description: |
  VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.
  **Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.

View File

@ -0,0 +1,20 @@
# Alert rule: a VictoriaMetrics component is not answering scrapes.
# The query selects `up` series equal to 0 for jobs whose name contains
# agent, vminsert, vmselect, vmstorage or vmauth.
# NOTE(review): the pattern uses "agent" (not "vmagent"), so any other job with
# "agent" in its name is matched too — confirm this is intended.
name: "VictoriaMetrics components down"

labels:
  service: "vmcomponents"
  severity: "critical"
  status: "test"

expression: |
  up{job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"} == 0

# Evaluation settings: the reduced value is compared for equality with 0.
condition_type: "eq"
threshold: 0
for: "3m"
need_reduce: true
reducer_type: "last"

no_data_state: "OK"
exec_err_state: "Error"

summary: |
  VictoriaMetrics компонент '{{ $labels.job }}' на инстансе {{ $labels.instance }} не отвечает.
description: |
  Компонент VictoriaMetrics '{{ $labels.job }}' на инстансе {{ $labels.instance }} перестал отвечать на запросы.
  **Влияние**: Это критический компонент инфраструктуры мониторинга. Его отказ может привести к потере метрик, неработающим дашбордам или остановке системы алертинга.

View File

@ -0,0 +1,20 @@
# Alert rule: a VictoriaMetrics component emits too many non-info log messages.
# The query sums the 5-minute increase of vm_log_messages_total for every level
# except "info", aggregating away app_version, location and is_printed.
name: "VictoriaMetrics Too Many Warning or Error Logs"
expression: |
  sum(increase(vm_log_messages_total{level!="info", job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"}[5m])) without (app_version, location, is_printed) > 40
# The in-query filter is kept equal to `threshold`, as in the sibling rules.
# It used to be `> 35` while the threshold was 40, so values in (35, 40] passed
# the query but could never fire; 40 is the value that actually decides firing.
threshold: 40
for: "15m"
condition_type: "gt"
need_reduce: true
reducer_type: "last"
no_data_state: "OK"
exec_err_state: "Error"
labels:
  service: "vmcomponents"
  severity: "warning"
  status: "test"
summary: |
  Слишком много сообщений типа "error"/"warning" по {{ $labels.job }} от инстанса {{ $labels.instance }}.
description: |
  Компонент '{{ $labels.job }}' (инстанс {{ $labels.instance }}) генерирует слишком много логов уровня 'warning' или 'error'.
  **Влияние:** Это указывает на наличие скрытых проблем, которые могут привести к деградации производительности или будущим сбоям.

View File

@ -0,0 +1,28 @@
# Alert rule: high CPU utilisation on a host.
# The query converts the average per-instance idle rate (5-minute window,
# job "self-monitoring") into a busy percentage: (1 - idle) * 100.
name: "Высокая загрузка CPU"

labels:
  service: "system"
  severity: "critical"
  status: "test"

expression: |
  (
  1 - avg by (instance) (rate(node_cpu_seconds_total{job="self-monitoring", mode="idle"}[5m]))
  ) * 100

# Evaluation settings: compare the reduced (max) value against 90 percent.
condition_type: "gt"
threshold: 90
for: "5m"
need_reduce: true
reducer_type: "max"

no_data_state: "OK"
exec_err_state: "Error"

# The summary prints $values.B.Value.
# NOTE(review): presumably refId B is the reduce step created by the consuming module — confirm.
summary: |
  High CPU usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} высокая загрузка CPU (более порога в течение заданного времени).
  Это сигнализирует о CPU-bound нагрузке, которая может повышать задержки и время ответа сервисов.
  Что проверить:
  1) top/htop: какие процессы потребляют CPU
  2) mpstat -P ALL 1: распределение по ядрам и steal
  3) run queue/load average (uptime, vmstat)
  4) всплески трафика, cron/job-процессы, фоновые задачи
  5) ошибки и таймауты приложений в логах

View File

@ -0,0 +1,26 @@
# Alert rule: very little available memory on a host.
# The query expresses MemAvailable as a percentage of MemTotal for the
# "self-monitoring" job.
name: "Мало свободной памяти"

labels:
  service: "system"
  severity: "critical"
  status: "test"

expression: |
  (node_memory_MemAvailable_bytes{job="self-monitoring"} / node_memory_MemTotal_bytes{job="self-monitoring"}) * 100

# Evaluation settings: compare the reduced (min) value against 5 percent.
condition_type: "lt"
threshold: 5
for: "5m"
need_reduce: true
reducer_type: "min"

no_data_state: "OK"
exec_err_state: "Error"

# The summary prints $values.B.Value.
# NOTE(review): presumably refId B is the reduce step created by the consuming module — confirm.
summary: |
  Low free memory {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} осталось очень мало доступной памяти.
  Это сигнализирует о memory pressure, риске OOM kill и деградации производительности.
  Что проверить:
  1) free -h, vmstat 1, swapon -s
  2) top/htop: процессы-лидеры по RSS/heap
  3) OOM события в dmesg/journalctl
  4) major page faults и IO wait (связанные алерты)
  5) лимиты/requests (для k8s) и необходимость увеличения RAM

View File

@ -0,0 +1,26 @@
# Alert rule: a host was rebooted recently.
# The query yields 1 while the host's uptime (now minus node_boot_time_seconds)
# is below 900 seconds, and 0 otherwise.
#
# Why not changes(node_boot_time_seconds[5m]) > bool 0:
#   * that expression can be non-zero for at most about 5 minutes, while `for`
#     also requires 5 minutes of continuous truth, so the rule could flap or
#     never leave the pending state;
#   * if the host stays down longer than the window, the old and the new boot
#     time never share a window and the reboot goes unnoticed.
# The uptime form stays true for 15 minutes after boot, which comfortably
# covers the 5-minute pending period regardless of how long the host was down.
name: "Обнаружена перезагрузка сервера"
expression: |
  (time() - node_boot_time_seconds{job="self-monitoring"}) < bool 900
threshold: 0
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"
no_data_state: "OK"
exec_err_state: "Error"
labels:
  service: "system"
  severity: "warning"
  status: "test"
summary: |
  Reboot detected on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} обнаружен недавний перезапуск.
  Это сигнализирует о возможном аварийном рестарте, плановых работах или проблемах питания/ядра.
  Что проверить:
  1) last reboot и uptime
  2) journalctl -b -1 и kernel-логи до перезапуска
  3) причины: OOM, kernel panic, watchdog, обновления
  4) состояние сервисов после рестарта
  5) повторяемость события и корреляцию с другими алертами

View File

@ -0,0 +1,24 @@
# Terraform settings: remote state backend and the providers this
# configuration depends on.
terraform {
  # State is stored in an S3-compatible bucket reached through the
  # storage.yandexcloud.net endpoint; every skip_* switch is enabled.
  backend "s3" {
    bucket = "monitoring-vcmt-core-deploy"
    key    = "dev-denis-practic/terraform.tfstate"
    region = "ru-central1"

    endpoints = {
      s3 = "https://storage.yandexcloud.net"
    }

    skip_credentials_validation = true
    skip_metadata_api_check     = true
    skip_region_validation      = true
    skip_requesting_account_id  = true
    skip_s3_checksum            = true
  }

  required_providers {
    # NOTE(review): the vault provider has no version constraint — consider pinning one.
    vault = {
      source = "hashicorp/vault"
    }

    grafana = {
      source  = "grafana/grafana"
      version = ">= 4.7.0"
    }
  }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
# Shared local values: HTTP headers for the Grafana provider and the list of
# alerting contact points for this environment.
locals {
  # HTTP headers derived from var.disable_provenance.
  # The header is emitted only when provenance must be disabled. Grafana's
  # provisioning API keys off the presence of X-Disable-Provenance, so sending
  # it with the value "false" (as was done before) would still disable
  # provenance and make var.disable_provenance = false ineffective.
  # NOTE(review): confirm against the Grafana version in use.
  grafana_headers = {
    for header_name, header_value in { "X-Disable-Provenance" = "true" } :
    header_name => header_value
    if var.disable_provenance
  }

  # Contact points configuration.
  # Every entry is of type "slack"; webhook URLs are read from the
  # vault_kv_secret_v2.secret_ext data source and all entries share one
  # message template path.
  contact_points = [
    {
      name       = "default"
      type       = "slack"
      is_default = true
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_default"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },
    {
      name       = "infra-alerts-critical"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_critical"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },
    {
      name       = "infra-alerts-informational"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_info"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },
    {
      name       = "infra-alerts-test"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_test"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    }
  ]
}

View File

@ -0,0 +1,21 @@
# Vault provider used to read the secrets consumed by this configuration.
# No token is configured here.
# NOTE(review): credentials presumably come from the environment — confirm in CI.
provider "vault" {
  skip_child_token = true
  address          = "https://vault.pyn.ru"
}
# Secrets for connecting to external systems (mm, clickhouse, etc.).
data "vault_kv_secret_v2" "secret_ext" {
  name  = "groups/infraservice/monitoring/grafana/dev/ext"
  mount = "app"
}
# Secrets used by Grafana itself.
data "vault_kv_secret_v2" "secret_int" {
  name  = "groups/infraservice/monitoring/grafana/dev/int"
  mount = "app"
}
# Grafana provider instance (alias "grafana01") for the dev Grafana.
# Credentials come from the secret_int Vault secret; extra HTTP headers come
# from local.grafana_headers.
# NOTE(review): the provider's `auth` expects an API token or "username:password";
# confirm the value stored under "grafana_local_admin_password" has that shape.
provider "grafana" {
  alias = "grafana01"

  url          = "https://grafana-dev.hhmon.ru/"
  http_headers = local.grafana_headers
  auth         = data.vault_kv_secret_v2.secret_int.data["grafana_local_admin_password"]
}

View File

@ -0,0 +1,54 @@
# Alert duration and timing configuration.
# All values are plain numbers or strings with defaults, so callers may omit them.

# --- Evaluation cadence -------------------------------------------------------

variable "interval_seconds" {
  type        = number
  default     = 60
  description = "Interval in seconds for evaluating alerts"
}

variable "default_interval_ms" {
  type        = number
  default     = 60000
  description = "Default interval in milliseconds for evaluating alert expressions"
}

variable "default_evaluation_interval" {
  type        = number
  default     = 60 # 1 minute
  description = "Default interval (in seconds) between alert rule evaluations"
}

variable "default_alert_duration" {
  type        = number
  default     = 300 # 5 minutes
  description = "Default duration (in seconds) for how long a condition must be true before alerting"
}

# --- Query shape --------------------------------------------------------------

variable "default_max_data_points" {
  type        = number
  default     = 43200
  description = "Default maximum number of data points"
}

variable "default_time_range_from" {
  type        = number
  default     = 604800 # 7 days
  description = "Default time range (in seconds) for main query lookback"
}

variable "default_processing_range" {
  type        = number
  default     = 600 # 10 minutes
  description = "Default time range (in seconds) for processing blocks"
}

# --- Fallback states ----------------------------------------------------------

variable "default_no_data_state" {
  type        = string
  default     = "OK"
  description = "Default no data state for alerts"
}

variable "default_exec_err_state" {
  type        = string
  default     = "Error"
  description = "Default execution error state for alerts"
}

View File

@ -0,0 +1,22 @@
#variable "grafana_url" {
# description = "Grafana URL"
# type = string
#}
#variable "grafana_auth" {
# description = "Grafana authentication token"
# type = string
#}
# Drives the X-Disable-Provenance request header built in local.grafana_headers.
# The previous description spoke of "provisioning" being disabled, which is not
# what this flag controls.
variable "disable_provenance" {
  description = "Whether to ask Grafana to disable provenance for resources managed through the API (drives the X-Disable-Provenance request header)"
  type        = bool
  default     = true
}

# Free-form environment description; it has no default, so callers must set it.
variable "env" {
  description = "Grafana environment description"
  type        = string
}

View File

@ -0,0 +1,22 @@
# Contact points to create. Defaults to an empty list.
# `is_default` falls back to false and `labels` may be omitted; `settings` is a
# flat string map.
variable "contact_points" {
  type = list(object({
    name       = string
    type       = string
    settings   = map(string)
    labels     = optional(map(string))
    is_default = optional(bool, false)
  }))
  default     = []
  description = "List of contact points"
}
#output "contact_point_ids01" {
# value = module.grafana_contact_points01.contact_point_ids
#}
#output "contact_point_ids02" {
# value = module.grafana_contact_points02.contact_point_ids
#}

View File

@ -0,0 +1,27 @@
# Grafana data sources to manage. The variable has no default, so callers must
# pass a list (possibly empty).
variable "datasources" {
  type = list(object({
    # Main parameters
    name        = string                 # Data source name (displayed in Grafana)
    uid         = string                 # Unique source identifier
    type        = string                 # Data source type (e.g., prometheus, mysql, clickhouse)
    access_mode = string                 # Access mode: proxy or direct
    is_default  = bool                   # Set as default source
    url         = optional(string, null) # Connection URL (for most sources)
    username    = optional(string, null)

    # Authentication settings
    basic_auth          = optional(bool, false)  # Use basic authentication
    basic_auth_user     = optional(string, null) # Username for basic authentication
    basic_auth_password = optional(string, null) # Password for basic authentication

    # Additional parameters
    # NOTE(review): map(any) requires all values to share one type, so mixed
    # bool/number/string settings get unified — confirm this is acceptable.
    json_data        = optional(map(any), {})    # Additional parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format

    # Terraform lifecycle management fields
    keep_manual_changes         = optional(bool, false) # Ignore manual changes in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
  description = "List of Grafana data sources"
}

View File

@ -0,0 +1,30 @@
# Notification routing rules. Each top-level policy may carry one level of
# nested `policies` with the same fields (minus further nesting).
# Defaults to an empty list.
variable "notification_policies" {
  type = list(object({
    contact_point = string

    # Label matchers selecting the alerts this policy applies to.
    matchers = list(object({
      label = string
      match = string # Allowed operators are = for equality, != for negated equality, =~ for regex equality, and !~ for negated regex equality
      value = string
    }))

    # Optional grouping and timing overrides.
    continue        = optional(bool)
    group_by        = optional(list(string))
    group_wait      = optional(string)
    group_interval  = optional(string)
    repeat_interval = optional(string)

    # Optional child policies; an omitted value becomes an empty list.
    policies = optional(list(object({
      contact_point = string

      matchers = list(object({
        label = string
        match = string
        value = string
      }))

      continue        = optional(bool)
      group_by        = optional(list(string))
      group_wait      = optional(string)
      group_interval  = optional(string)
      repeat_interval = optional(string)
    })), [])
  }))
  default     = []
  description = "Routing rules for specific label sets"
}

View File

@ -0,0 +1,15 @@
# Organizations managed at the environment level.
# Every field is required and the variable has no default.
variable "organizations" {
  type = list(object({
    organization_name           = string
    create_new_organization     = bool
    keep_manual_changes         = bool
    prevent_destroy_on_recreate = bool
  }))
  description = "Grafana organization configuration"
}
# Identifier of the Grafana organization, passed as a string; required (no default).
variable "org_id" {
  type        = string
  description = "Grafana organization ID"
}