Add dashboard UID auto-generation and Gitea CI workflow
Some checks failed
terraform-dev / validate (push) Failing after 1m53s
terraform-dev / plan (push) Has been skipped
terraform-dev / apply (push) Has been skipped

This commit is contained in:
Alexandr
2026-03-25 06:41:19 +03:00
parent 345c5786b3
commit 558a23d916
83 changed files with 53372 additions and 1 deletions

View File

@ -0,0 +1,31 @@
# Alert rule definition (dev test alert, labelled status "test").
# The query computes used filesystem space in percent as
# 100 - (avail_bytes * 100 / size_bytes) for instances matching "ydx-.*:9100",
# restricted to zfs, xfs and ext? filesystems and skipping mountpoints under
# /sys, /proc, /dev, /run and /boot.
# The rule compares the value reduced with "min" against threshold 90 using
# condition_type "gt", and the condition must hold for 1m.
# NOTE(review): sibling disk-usage rules use reducer_type "max" and
# no_data_state "OK" — confirm that "min"/"NoData" is intentional here.
name: "DEV ADIBROV - Low Disk Space (10%) - VCMT Nodes TEST ALLERT DEV"
expression: |
100 - (
node_filesystem_avail_bytes{
instance=~"ydx-.*:9100",
mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
fstype=~"(zfs|xfs|ext.)"
}
* 100
/
node_filesystem_size_bytes{
instance=~"ydx-.*:9100",
mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
fstype=~"(zfs|xfs|ext.)"
}
)
threshold: 90
for: "1m"
condition_type: "gt"
need_reduce: true
reducer_type: "min"
no_data_state: "NoData"
exec_err_state: "Error"
labels:
service: "system"
severity: "critical"
status: "test"
summary: |
{{ printf "%.0f" $values.B.Value }}% Usage on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!

View File

@ -0,0 +1,20 @@
# Alert rule definition (dev test alert, labelled status "test").
# The query sums the 5m increase of vm_persistentqueue_bytes_dropped_total for
# jobs matching ".*agent.*", aggregating away the "path" label, and keeps only
# series whose value is above 0.
# The rule then applies a "sum" reducer and fires when the result is "gt" 0
# for 10m; missing data is treated as OK and execution errors keep the last state.
# NOTE(review): the "> 0" filter inside the query duplicates threshold 0 —
# presumably deliberate so that healthy instances return no data; confirm.
name: "DEV ADIBROV - Vmagent Persistent Queue Is Dropping DataTEST ALLERT DEV"
expression: |
sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0
threshold: 0
for: "10m"
condition_type: "gt"
need_reduce: true
reducer_type: "sum"
no_data_state: "OK"
exec_err_state: "KeepLast"
labels:
service: "vmagent"
severity: "critical"
status: "test"
summary: |
Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
description: |
VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.
**Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.

View File

@ -0,0 +1,20 @@
# Alert rule definition (dev test alert, labelled status "test").
# The query takes the 5m increase of vm_promscrape_scrapes_failed_total for
# jobs matching ".*agent.*" and keeps only series above 35.
# The rule is configured without a reducer (need_reduce: false), with
# condition_type "gt" against threshold 40, held for 15m.
# NOTE(review): the query filters at "> 35" while the threshold is 40, so
# values between 35 and 40 return data but do not exceed the threshold —
# confirm which of the two numbers is the intended limit.
name: "DEV ADIBROV - Vmagent Too Many Scrape ErrorsTEST ALLERT DEV"
expression: |
increase(vm_promscrape_scrapes_failed_total{job=~".*agent.*"}[5m]) > 35
threshold: 40 # temporary threshold
for: "15m"
condition_type: "gt"
need_reduce: false
no_data_state: "OK"
exec_err_state: "KeepLast"
labels:
service: "vmagent"
severity: "warning"
status: "test"
summary: |
Vmagent не может собрать один или несколько target'ов на инстансе {{ $labels.instance }}.
description: |
Job "{{ $labels.job }}" на инстансе {{ $labels.instance }} не может успешно скрапить target'ы в течение последних 15 минут.
**Влияние:** ПРОИСХОДИТ ПРЯМАЯ ПОТЕРЯ МЕТРИК ОТ ЦЕЛЕВОГО СЕРВИСА!
Вы не получаете данные от одного или нескольких наблюдаемых сервисов. Дашборды и алерты, связанные с этими target'ами, будут показывать неполную или устаревшую информацию в мониторинге.

View File

@ -0,0 +1,29 @@
# Alert rule definition (dev test alert, labelled status "test").
# The query computes used filesystem space in percent as
# 100 - (avail_bytes * 100 / size_bytes) for job "self-monitoring", restricted
# to zfs, xfs and ext? filesystems and skipping mountpoints under /sys, /proc,
# /dev, /run and /boot.
# The rule reduces with "max" and fires when the result is "gt" 90 for 5m,
# i.e. less than 10% of the space is still free.
name: "DEV ADIBROV - Критически мало места на диске (свободно 10%)TEST ALLERT DEV"
expression: |
  (
  100
  - (
  node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  * 100
  / node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  )
  )
threshold: 90
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"
no_data_state: "OK"
exec_err_state: "Error"
labels:
  service: "system"
  severity: "critical"
  status: "test"
summary: |
  Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
# The last description line previously began with a stray "1"; it now matches
# the 20% and 30% sibling rules.
description: |
  На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 10% свободного места.
  Это сигнализирует о критически высоком риске остановки записи, сбоев сервисов и ошибок приложений.
  Что проверить:
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!

View File

@ -0,0 +1,29 @@
# Alert rule definition (dev test alert, labelled status "test").
# The query computes used filesystem space in percent as
# 100 - (avail_bytes * 100 / size_bytes) for job "self-monitoring", restricted
# to zfs, xfs and ext? filesystems and skipping mountpoints under /sys, /proc,
# /dev, /run and /boot.
# The rule reduces with "max" and fires when the result is "gt" 80 for 5m,
# i.e. less than 20% of the space is still free.
name: "DEV ADIBROV - Мало места на диске (свободно 20%)TEST ALLERT DEV"
expression: |
(
100
- (
node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
* 100
/ node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
)
)
threshold: 80
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"
no_data_state: "OK"
exec_err_state: "Error"
labels:
service: "system"
severity: "warning"
status: "test"
summary: |
Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 20% свободного места.
Это сигнализирует о быстром приближении к исчерпанию места и риске деградации записи.
Что проверить:
ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!

View File

@ -0,0 +1,29 @@
# Alert rule definition (dev test alert, labelled status "test").
# The query computes used filesystem space in percent as
# 100 - (avail_bytes * 100 / size_bytes) for job "self-monitoring", restricted
# to zfs, xfs and ext? filesystems and skipping mountpoints under /sys, /proc,
# /dev, /run and /boot.
# The rule reduces with "max" and fires when the result is "gt" 70 for 5m,
# i.e. less than 30% of the space is still free.
name: "DEV ADIBROV - Мало места на диске (свободно 30%)TEST ALLERT DEV"
expression: |
(
100
- (
node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
* 100
/ node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
)
)
threshold: 70
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"
no_data_state: "OK"
exec_err_state: "Error"
labels:
service: "system"
severity: "warning"
status: "test"
summary: |
Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 30% свободного места.
Это сигнализирует о раннем риске заполнения диска и необходимости плановой очистки.
Что проверить:
ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!

View File

@ -0,0 +1,24 @@
# Provider requirements and remote state configuration for this environment.
terraform {
  # State lives in an S3-compatible bucket. The endpoint is overridden to
  # storage.yandexcloud.net, and the skip_* switches turn off the backend's
  # region/credential validation, account-id lookup, checksum and metadata
  # API steps.
  backend "s3" {
    bucket = "monitoring-vcmt-core-deploy"
    key    = "a.dibrov-practic/terraform.tfstate"
    region = "ru-central1"

    endpoints = {
      s3 = "https://storage.yandexcloud.net"
    }

    skip_region_validation      = true
    skip_credentials_validation = true
    skip_requesting_account_id  = true
    skip_s3_checksum            = true
    skip_metadata_api_check     = true
  }

  required_providers {
    # Grafana provider, any release from 4.7.0 upwards.
    grafana = {
      source  = "grafana/grafana"
      version = ">= 4.7.0"
    }

    # Vault provider; no version constraint is declared.
    vault = {
      source = "hashicorp/vault"
    }
  }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
locals {
  # HTTP headers handed to the Grafana provider, derived from
  # var.disable_provenance.
  #
  # The X-Disable-Provenance header is emitted only when provenance should be
  # disabled. Grafana's provisioning API appears to treat the mere presence of
  # this header as the opt-out (verify against your Grafana version), so
  # sending it with the value "false" could still disable provenance. Leaving
  # the header out is correct whether Grafana checks presence or value.
  grafana_headers = {
    for header, value in { "X-Disable-Provenance" = "true" } :
    header => value
    if var.disable_provenance
  }

  # Contact points configuration.
  # Each entry uses the "slack" contact point type; the webhook URL is read
  # from the external-secrets Vault document and all entries share the same
  # message template file.
  contact_points = [
    {
      name       = "default"
      type       = "slack"
      is_default = true
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_default"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },
    {
      name       = "infra-alerts-critical"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_critical"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },
    {
      name       = "infra-alerts-informational"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_info"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },
    {
      name       = "infra-alerts-test"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_test"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    }
  ]
}

View File

@ -0,0 +1,21 @@
# Vault connection used by the data sources below.
provider "vault" {
  address          = "https://vault.pyn.ru"
  skip_child_token = true
}

# Secrets for connecting to external systems (mm, clickhouse, etc.).
data "vault_kv_secret_v2" "secret_ext" {
  mount = "app"
  name  = "groups/infraservice/monitoring/grafana/dev/ext"
}

# Secrets used by Grafana itself.
data "vault_kv_secret_v2" "secret_int" {
  mount = "app"
  name  = "groups/infraservice/monitoring/grafana/dev/int"
}

# Grafana provider instance, addressed through the alias "grafana01".
# Credentials are read from the internal Vault secret and the extra HTTP
# headers come from local.grafana_headers.
provider "grafana" {
  alias        = "grafana01"
  url          = "https://grafana-dev.hhmon.ru/"
  auth         = data.vault_kv_secret_v2.secret_int.data["grafana_local_admin_password"]
  http_headers = local.grafana_headers
}

View File

@ -0,0 +1,54 @@
# Alert duration and timing configuration.
# NOTE(review): interval_seconds and default_evaluation_interval both default
# to 60 and have overlapping descriptions — confirm that both are needed.

variable "interval_seconds" {
  type        = number
  description = "Interval in seconds for evaluating alerts"
  default     = 60
}

variable "default_interval_ms" {
  type        = number
  description = "Default interval in milliseconds for evaluating alert expressions"
  default     = 60000
}

variable "default_max_data_points" {
  type        = number
  description = "Default maximum number of data points"
  default     = 43200
}

variable "default_no_data_state" {
  type        = string
  description = "Default no data state for alerts"
  default     = "OK"
}

variable "default_exec_err_state" {
  type        = string
  description = "Default execution error state for alerts"
  default     = "Error"
}

variable "default_alert_duration" {
  type        = number
  description = "Default duration (in seconds) for how long a condition must be true before alerting"
  default     = 300 # 5 minutes
}

variable "default_evaluation_interval" {
  type        = number
  description = "Default interval (in seconds) between alert rule evaluations"
  default     = 60 # 1 minute
}

variable "default_time_range_from" {
  type        = number
  description = "Default time range (in seconds) for main query lookback"
  default     = 604800 # 7 days
}

variable "default_processing_range" {
  type        = number
  description = "Default time range (in seconds) for processing blocks"
  default     = 600 # 10 minutes
}

View File

@ -0,0 +1,22 @@
#variable "grafana_url" {
# description = "Grafana URL"
# type = string
#}
#variable "grafana_auth" {
# description = "Grafana authentication token"
# type = string
#}
# Toggle behind the X-Disable-Provenance request header sent to Grafana.
# The previous description spoke of "provisioning", but the value drives the
# provenance header, not whether provisioning happens.
variable "disable_provenance" {
  description = "When true, Grafana API requests carry X-Disable-Provenance set to \"true\", so resources created by Terraform are not marked as provisioned and stay editable in the Grafana UI"
  type        = bool
  default     = true
}
# Free-form description of the Grafana environment; required (no default).
variable "env" {
  type        = string
  description = "Grafana environment description"
}

View File

@ -0,0 +1,22 @@
# Contact points to create. Each entry needs a name, a type and a string map
# of settings; is_default falls back to false and labels may be omitted.
# Defaults to an empty list.
variable "contact_points" {
  type = list(object({
    name       = string
    type       = string
    is_default = optional(bool, false)
    labels     = optional(map(string))
    settings   = map(string)
  }))
  description = "List of contact points"
  default     = []
}
#output "contact_point_ids01" {
# value = module.grafana_contact_points01.contact_point_ids
#}
#output "contact_point_ids02" {
# value = module.grafana_contact_points02.contact_point_ids
#}

View File

@ -0,0 +1,27 @@
# Data sources to manage. Required (no default).
variable "datasources" {
  type = list(object({
    # --- Main parameters ---
    name        = string                 # Data source name (displayed in Grafana)
    uid         = string                 # Unique source identifier
    type        = string                 # Data source type (e.g., prometheus, mysql, clickhouse)
    url         = optional(string, null) # Connection URL (for most sources)
    username    = optional(string, null)
    access_mode = string                 # Access mode: proxy or direct
    is_default  = bool                   # Set as default source

    # --- Authentication settings ---
    basic_auth          = optional(bool, false)  # Use basic authentication
    basic_auth_user     = optional(string, null) # Username for basic authentication
    basic_auth_password = optional(string, null) # Password for basic authentication

    # --- Additional parameters ---
    json_data        = optional(map(any), {})    # Additional parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format

    # --- Terraform lifecycle management fields ---
    keep_manual_changes         = optional(bool, false) # Ignore manual changes in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
  description = "List of Grafana data sources"
}

View File

@ -0,0 +1,30 @@
# Notification routing rules. Every policy names a contact point and a list
# of matchers; grouping and timing fields are optional. A policy may carry one
# level of nested child policies with the same shape (minus further nesting),
# which defaults to an empty list. The variable itself defaults to [].
variable "notification_policies" {
  type = list(object({
    contact_point   = string
    continue        = optional(bool)
    group_by        = optional(list(string))
    group_wait      = optional(string)
    group_interval  = optional(string)
    repeat_interval = optional(string)

    matchers = list(object({
      label = string
      # Allowed operators are = for equality, != for negated equality,
      # =~ for regex equality, and !~ for negated regex equality
      match = string
      value = string
    }))

    policies = optional(list(object({
      contact_point   = string
      continue        = optional(bool)
      group_by        = optional(list(string))
      group_wait      = optional(string)
      group_interval  = optional(string)
      repeat_interval = optional(string)

      matchers = list(object({
        label = string
        match = string
        value = string
      }))
    })), [])
  }))
  description = "Routing rules for specific label sets"
  default     = []
}

View File

@ -0,0 +1,15 @@
# Environment-level input describing the Grafana organizations.
# All four attributes of each entry are mandatory and the variable has no
# default, so callers must always supply it.
variable "organizations" {
  type = list(object({
    create_new_organization     = bool
    keep_manual_changes         = bool
    prevent_destroy_on_recreate = bool
    organization_name           = string
  }))
  description = "Grafana organization configuration"
}
# Identifier of the Grafana organization, passed as a string; required (no default).
variable "org_id" {
  type        = string
  description = "Grafana organization ID"
}