Add dashboard UID auto-generation and Gitea CI workflow
This commit is contained in:
@ -0,0 +1,31 @@
|
||||
# Dev test rule: used-space percentage per filesystem on hosts matching ydx-.*:9100.
# NOTE(review): indentation was not visible in the reviewed copy; nesting is reconstructed
# (labels as a child map, summary/description as top-level keys) - confirm against the module.
name: 'DEV ADIBROV - Low Disk Space (10%) - VCMT Nodes TEST ALLERT DEV'

# used % = 100 - avail * 100 / size; pseudo filesystems and /boot are excluded by mountpoint.
expression: |
  100 - (
  node_filesystem_avail_bytes{
  instance=~"ydx-.*:9100",
  mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
  fstype=~"(zfs|xfs|ext.)"
  }
  * 100
  /
  node_filesystem_size_bytes{
  instance=~"ydx-.*:9100",
  mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
  fstype=~"(zfs|xfs|ext.)"
  }
  )

# Condition settings.
condition_type: 'gt'
threshold: 90
for: '1m'
need_reduce: true
reducer_type: 'min'

# State mapping for missing data and evaluation errors.
no_data_state: 'NoData'
exec_err_state: 'Error'

labels:
  severity: 'critical'
  service: 'system'
  status: 'test'

summary: |
  {{ printf "%.0f" $values.B.Value }}% Usage on {{ $labels.mountpoint }} ({{ $labels.instance }})
# The description text reads: "TEST ALERT IN THE DEV ENVIRONMENT - DO NOT REACT".
description: |
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
@ -0,0 +1,20 @@
|
||||
# Dev test rule: bytes dropped from the vmagent persistent queue over the last 5 minutes.
# NOTE(review): indentation was not visible in the reviewed copy; nesting is reconstructed
# (labels as a child map, summary/description as top-level keys) - confirm against the module.
name: 'DEV ADIBROV - Vmagent Persistent Queue Is Dropping DataTEST ALLERT DEV'

# The query already filters with "> 0", so only series that dropped data are returned.
expression: |
  sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0

# Condition settings.
condition_type: 'gt'
threshold: 0
for: '10m'
need_reduce: true
reducer_type: 'sum'

# State mapping for missing data and evaluation errors.
no_data_state: 'OK'
exec_err_state: 'KeepLast'

labels:
  severity: 'critical'
  service: 'vmagent'
  status: 'test'

summary: |
  Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
description: |
  VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.

  **Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.
|
||||
@ -0,0 +1,20 @@
|
||||
# Dev test rule: failed vmagent scrapes over the last 5 minutes.
# NOTE(review): indentation was not visible in the reviewed copy; nesting is reconstructed
# (labels as a child map, summary/description as top-level keys) - confirm against the module.
name: 'DEV ADIBROV - Vmagent Too Many Scrape ErrorsTEST ALLERT DEV'

# NOTE(review): the query filters at "> 35" while the rule threshold below is 40; values in
# (35, 40] pass the query but do not exceed the threshold - confirm which number is intended.
expression: |
  increase(vm_promscrape_scrapes_failed_total{job=~".*agent.*"}[5m]) > 35

# Condition settings.
condition_type: 'gt'
threshold: 40 # temporary threshold
for: '15m'
need_reduce: false

# State mapping for missing data and evaluation errors.
no_data_state: 'OK'
exec_err_state: 'KeepLast'

labels:
  severity: 'warning'
  service: 'vmagent'
  status: 'test'

summary: |
  Vmagent не может собрать один или несколько target'ов на инстансе {{ $labels.instance }}.
description: |
  Job "{{ $labels.job }}" на инстансе {{ $labels.instance }} не может успешно скрапить target'ы в течение последних 15 минут.

  **Влияние:** ПРОИСХОДИТ ПРЯМАЯ ПОТЕРЯ МЕТРИК ОТ ЦЕЛЕВОГО СЕРВИСА!
  Вы не получаете данные от одного или нескольких наблюдаемых сервисов. Дашборды и алерты, связанные с этими target'ами, будут показывать неполную или устаревшую информацию в мониторинге.
|
||||
@ -0,0 +1,29 @@
|
||||
# Dev test rule: used-space percentage per filesystem for job "self-monitoring";
# the threshold of 90% used corresponds to the "10% free" in the rule name.
# NOTE(review): indentation was not visible in the reviewed copy; nesting is reconstructed
# (labels as a child map, summary/description as top-level keys) - confirm against the module.
name: "DEV ADIBROV - Критически мало места на диске (свободно 10%)TEST ALLERT DEV"

# used % = 100 - avail * 100 / size; pseudo filesystems and /boot are excluded by mountpoint.
expression: |
  (
  100
  - (
  node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  * 100
  / node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  )
  )

# Condition settings.
threshold: 90
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"

# State mapping for missing data and evaluation errors.
no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "system"
  severity: "critical"
  status: "test"

summary: |
  Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
# The last description line previously began with a stray "1"; it now matches the same
# line in the 20% and 30% disk-space rules.
description: |
  На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 10% свободного места.
  Это сигнализирует о критически высоком риске остановки записи, сбоев сервисов и ошибок приложений.

  Что проверить:
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
@ -0,0 +1,29 @@
|
||||
# Dev test rule: used-space percentage per filesystem for job "self-monitoring";
# the threshold of 80% used corresponds to the "20% free" in the rule name.
# NOTE(review): indentation was not visible in the reviewed copy; nesting is reconstructed
# (labels as a child map, summary/description as top-level keys) - confirm against the module.
name: 'DEV ADIBROV - Мало места на диске (свободно 20%)TEST ALLERT DEV'

# used % = 100 - avail * 100 / size; pseudo filesystems and /boot are excluded by mountpoint.
expression: |
  (
  100
  - (
  node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  * 100
  / node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  )
  )

# Condition settings.
condition_type: 'gt'
threshold: 80
for: '5m'
need_reduce: true
reducer_type: 'max'

# State mapping for missing data and evaluation errors.
no_data_state: 'OK'
exec_err_state: 'Error'

labels:
  severity: 'warning'
  service: 'system'
  status: 'test'

summary: |
  Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
  На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 20% свободного места.
  Это сигнализирует о быстром приближении к исчерпанию места и риске деградации записи.

  Что проверить:
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
@ -0,0 +1,29 @@
|
||||
# Dev test rule: used-space percentage per filesystem for job "self-monitoring";
# the threshold of 70% used corresponds to the "30% free" in the rule name.
# NOTE(review): indentation was not visible in the reviewed copy; nesting is reconstructed
# (labels as a child map, summary/description as top-level keys) - confirm against the module.
name: 'DEV ADIBROV - Мало места на диске (свободно 30%)TEST ALLERT DEV'

# used % = 100 - avail * 100 / size; pseudo filesystems and /boot are excluded by mountpoint.
expression: |
  (
  100
  - (
  node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  * 100
  / node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
  )
  )

# Condition settings.
condition_type: 'gt'
threshold: 70
for: '5m'
need_reduce: true
reducer_type: 'max'

# State mapping for missing data and evaluation errors.
no_data_state: 'OK'
exec_err_state: 'Error'

labels:
  severity: 'warning'
  service: 'system'
  status: 'test'

summary: |
  Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
  На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 30% свободного места.
  Это сигнализирует о раннем риске заполнения диска и необходимости плановой очистки.

  Что проверить:
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
24
environments/dev/adibrov/backend.tf
Normal file
24
environments/dev/adibrov/backend.tf
Normal file
@ -0,0 +1,24 @@
|
||||
# Terraform settings for this environment: CLI/provider requirements and remote state.
terraform {
  # The s3 backend below uses `endpoints`, `skip_requesting_account_id` and
  # `skip_s3_checksum`, which belong to the Terraform 1.6 series; fail fast on older CLIs
  # instead of surfacing an "unsupported argument" error from the backend block.
  required_version = ">= 1.6.0"

  required_providers {
    grafana = {
      source  = "grafana/grafana"
      version = ">= 4.7.0"
    }
    # NOTE(review): the Vault provider has no version constraint - consider adding one.
    vault = {
      source = "hashicorp/vault"
    }
  }

  # State is kept in an S3-compatible bucket behind a custom endpoint; the skip_* flags
  # turn off the corresponding validation/metadata steps of the s3 backend.
  backend "s3" {
    endpoints = {
      s3 = "https://storage.yandexcloud.net"
    }
    bucket = "monitoring-vcmt-core-deploy"
    region = "ru-central1"
    key    = "a.dibrov-practic/terraform.tfstate"

    skip_region_validation      = true
    skip_credentials_validation = true
    skip_requesting_account_id  = true
    skip_s3_checksum            = true
    skip_metadata_api_check     = true
  }
}
|
||||
3675
environments/dev/adibrov/dashboards/self-monitoring/angie.json
Normal file
3675
environments/dev/adibrov/dashboards/self-monitoring/angie.json
Normal file
File diff suppressed because it is too large
Load Diff
23876
environments/dev/adibrov/dashboards/system/node_exporter_full.json
Normal file
23876
environments/dev/adibrov/dashboards/system/node_exporter_full.json
Normal file
File diff suppressed because it is too large
Load Diff
50
environments/dev/adibrov/locals.tf
Normal file
50
environments/dev/adibrov/locals.tf
Normal file
@ -0,0 +1,50 @@
|
||||
locals {
  # HTTP headers derived from var.disable_provenance (sent as the string "true" or "false").
  grafana_headers = {
    "X-Disable-Provenance" = var.disable_provenance ? "true" : "false"
  }

  # Contact points configuration. Every entry has the same type and message template; they
  # differ only in name, default flag and the Vault key holding the webhook URL, so the list
  # is generated from a compact table.
  # NOTE(review): the template path is relative - confirm which directory the consuming
  # module resolves it against.
  contact_points = [
    for cp in [
      { name = "default", is_default = true, webhook_key = "mmwebhook_default" },
      { name = "infra-alerts-critical", is_default = false, webhook_key = "mmwebhook_infra_alerts_critical" },
      { name = "infra-alerts-informational", is_default = false, webhook_key = "mmwebhook_infra_alerts_info" },
      { name = "infra-alerts-test", is_default = false, webhook_key = "mmwebhook_infra_alerts_test" },
    ] : {
      name       = cp.name
      type       = "slack"
      is_default = cp.is_default
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data[cp.webhook_key]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    }
  ]
}
|
||||
21
environments/dev/adibrov/providers.tf
Normal file
21
environments/dev/adibrov/providers.tf
Normal file
@ -0,0 +1,21 @@
|
||||
# Vault provider; the data sources in this file read their secrets through it.
provider "vault" {
  skip_child_token = true
  address          = "https://vault.pyn.ru"
}
|
||||
|
||||
# Secrets for connecting to external systems (mm, clickhouse, etc.).
data "vault_kv_secret_v2" "secret_ext" {
  name  = "groups/infraservice/monitoring/grafana/dev/ext"
  mount = "app"
}
|
||||
|
||||
# Secrets used by Grafana itself.
data "vault_kv_secret_v2" "secret_int" {
  name  = "groups/infraservice/monitoring/grafana/dev/int"
  mount = "app"
}
|
||||
|
||||
# Aliased Grafana provider for the dev instance; credentials are read from Vault.
provider "grafana" {
  alias = "grafana01"

  url = "https://grafana-dev.hhmon.ru/"

  # NOTE(review): the provider's `auth` argument takes an API token or "username:password";
  # the key name suggests a bare password - confirm the stored value has the expected form.
  auth = data.vault_kv_secret_v2.secret_int.data["grafana_local_admin_password"]

  # Extra request headers (see local.grafana_headers).
  http_headers = local.grafana_headers
}
|
||||
54
environments/dev/adibrov/variables_alert.tf
Normal file
54
environments/dev/adibrov/variables_alert.tf
Normal file
@ -0,0 +1,54 @@
|
||||
# Alert duration and timing configuration
|
||||
# Alert evaluation interval, in seconds (default: 60).
variable "interval_seconds" {
  type        = number
  default     = 60
  description = "Interval in seconds for evaluating alerts"
}
|
||||
|
||||
# Evaluation interval for alert expressions, in milliseconds (default: 60000).
variable "default_interval_ms" {
  type        = number
  default     = 60000
  description = "Default interval in milliseconds for evaluating alert expressions"
}
|
||||
|
||||
# Upper bound on the number of data points (default: 43200).
variable "default_max_data_points" {
  type        = number
  default     = 43200
  description = "Default maximum number of data points"
}
|
||||
|
||||
# Default state an alert rule enters when its query returns no data.
variable "default_no_data_state" {
  description = "Default no data state for alerts"
  type        = string
  default     = "OK"

  # Reject typos at plan time; these are the no-data states documented for Grafana alert rules.
  validation {
    condition     = contains(["OK", "NoData", "KeepLast", "Alerting"], var.default_no_data_state)
    error_message = "The default_no_data_state value must be one of: OK, NoData, KeepLast, Alerting."
  }
}
|
||||
|
||||
# Default state an alert rule enters when its evaluation fails.
variable "default_exec_err_state" {
  description = "Default execution error state for alerts"
  type        = string
  default     = "Error"

  # Reject typos at plan time; these are the error states documented for Grafana alert rules.
  validation {
    condition     = contains(["OK", "Error", "KeepLast", "Alerting"], var.default_exec_err_state)
    error_message = "The default_exec_err_state value must be one of: OK, Error, KeepLast, Alerting."
  }
}
|
||||
|
||||
# How long a condition must hold before alerting, in seconds.
variable "default_alert_duration" {
  type        = number
  default     = 300 # 5 minutes
  description = "Default duration (in seconds) for how long a condition must be true before alerting"
}
|
||||
|
||||
# Time between alert rule evaluations, in seconds.
# NOTE(review): same default as var.interval_seconds - confirm both variables are needed.
variable "default_evaluation_interval" {
  type        = number
  default     = 60 # 1 minute
  description = "Default interval (in seconds) between alert rule evaluations"
}
|
||||
|
||||
# Lookback window of the main query, in seconds.
variable "default_time_range_from" {
  type        = number
  default     = 604800 # 7 days
  description = "Default time range (in seconds) for main query lookback"
}
|
||||
|
||||
# Time range used by processing blocks, in seconds.
variable "default_processing_range" {
  type        = number
  default     = 600 # 10 minutes
  description = "Default time range (in seconds) for processing blocks"
}
|
||||
22
environments/dev/adibrov/variables_auth.tf
Normal file
22
environments/dev/adibrov/variables_auth.tf
Normal file
@ -0,0 +1,22 @@
|
||||
#variable "grafana_url" {
|
||||
# description = "Grafana URL"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
#variable "grafana_auth" {
|
||||
# description = "Grafana authentication token"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
# Drives the X-Disable-Provenance header built in local.grafana_headers.
# The previous description said "provisioning", which is a different Grafana concept.
variable "disable_provenance" {
  description = "Value sent in the X-Disable-Provenance HTTP header on Grafana API requests (true disables provenance tracking for managed resources)"
  type        = bool
  default     = true
}
|
||||
|
||||
# Free-form description of the Grafana environment; required (no default).
variable "env" {
  type        = string
  description = "Grafana environment description"
}
|
||||
|
||||
|
||||
22
environments/dev/adibrov/variables_contact_points.tf
Normal file
22
environments/dev/adibrov/variables_contact_points.tf
Normal file
@ -0,0 +1,22 @@
|
||||
# Contact points supplied as input; empty by default.
# NOTE(review): local.contact_points defines a list with the same name - confirm which of
# the two the contact-point module actually consumes.
variable "contact_points" {
  description = "List of contact points"
  default     = []

  type = list(object({
    name       = string
    type       = string
    settings   = map(string)
    labels     = optional(map(string))
    is_default = optional(bool, false)
  }))
}
|
||||
|
||||
|
||||
#output "contact_point_ids01" {
|
||||
# value = module.grafana_contact_points01.contact_point_ids
|
||||
#}
|
||||
|
||||
#output "contact_point_ids02" {
|
||||
# value = module.grafana_contact_points02.contact_point_ids
|
||||
#}
|
||||
|
||||
|
||||
27
environments/dev/adibrov/variables_datasource.tf
Normal file
27
environments/dev/adibrov/variables_datasource.tf
Normal file
@ -0,0 +1,27 @@
|
||||
# Grafana data sources for this environment; required (no default).
variable "datasources" {
  description = "List of Grafana data sources"

  type = list(object({
    # Identity and connection
    name        = string                 # Data source name (displayed in Grafana)
    uid         = string                 # Unique source identifier
    type        = string                 # Data source type (e.g., prometheus, mysql, clickhouse)
    access_mode = string                 # Access mode: proxy or direct
    is_default  = bool                   # Set as default source
    url         = optional(string, null) # Connection URL (for most sources)
    username    = optional(string, null)

    # Basic authentication
    basic_auth          = optional(bool, false)  # Use basic authentication
    basic_auth_user     = optional(string, null) # Username for basic authentication
    basic_auth_password = optional(string, null) # Password for basic authentication

    # Extra settings
    # NOTE(review): map(any) requires all values to share one type - confirm this fits
    # json_data payloads that mix strings, numbers and booleans.
    json_data        = optional(map(any), {})    # Additional parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format

    # Terraform lifecycle management
    keep_manual_changes         = optional(bool, false) # Ignore manual changes in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
}
|
||||
|
||||
30
environments/dev/adibrov/variables_notification_policies.tf
Normal file
30
environments/dev/adibrov/variables_notification_policies.tf
Normal file
@ -0,0 +1,30 @@
|
||||
# Notification routing rules; each rule may carry one level of nested child policies.
variable "notification_policies" {
  description = "Routing rules for specific label sets"
  default     = []

  type = list(object({
    contact_point = string

    # Allowed `match` operators: = (equality), != (negated equality),
    # =~ (regex match), !~ (negated regex match).
    matchers = list(object({
      label = string
      match = string
      value = string
    }))

    continue        = optional(bool)
    group_by        = optional(list(string))
    group_wait      = optional(string)
    group_interval  = optional(string)
    repeat_interval = optional(string)

    # Child policies use the same fields as the parent, minus further nesting.
    policies = optional(list(object({
      contact_point = string

      matchers = list(object({
        label = string
        match = string
        value = string
      }))

      continue        = optional(bool)
      group_by        = optional(list(string))
      group_wait      = optional(string)
      group_interval  = optional(string)
      repeat_interval = optional(string)
    })), [])
  }))
}
|
||||
15
environments/dev/adibrov/variables_organization.tf
Normal file
15
environments/dev/adibrov/variables_organization.tf
Normal file
@ -0,0 +1,15 @@
|
||||
# Input variable for organizations at the environment level
|
||||
# Organizations managed at the environment level; required, and every field must be set.
variable "organizations" {
  description = "Grafana organization configuration"

  type = list(object({
    organization_name           = string
    create_new_organization     = bool
    keep_manual_changes         = bool
    prevent_destroy_on_recreate = bool
  }))
}
|
||||
|
||||
# Grafana organization ID, passed as a string; required (no default).
variable "org_id" {
  type        = string
  description = "Grafana organization ID"
}
|
||||
Reference in New Issue
Block a user