Add dashboard UID auto-generation and Gitea CI workflow
This commit is contained in:
@ -0,0 +1,20 @@
|
||||
# Alert rule: vmagent is dropping bytes from its persistent queue.
name: "Vmagent Persistent Queue Is Dropping Data"

# Labels attached to the alert.
labels:
  service: "vmagent"
  severity: "critical"
  status: "test"

# Query: 5m increase of dropped bytes for jobs matching ".*agent.*",
# summed with the `path` label removed; only positive results are kept.
expression: |
  sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0

# Condition: reduce with "sum", compare "gt" against the threshold, hold for 10m.
need_reduce: true
reducer_type: "sum"
condition_type: "gt"
threshold: 0
for: "10m"

# State used when the query returns no data / fails to execute.
no_data_state: "OK"
exec_err_state: "KeepLast"

# Annotations (templated with the alert's labels).
summary: |
  Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
description: |
  VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.

  **Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.
|
||||
@ -0,0 +1,20 @@
|
||||
# Alert rule: a VictoriaMetrics component target reports up == 0.
name: "VictoriaMetrics components down"

# Labels attached to the alert.
labels:
  service: "vmcomponents"
  severity: "critical"
  status: "test"

# Query: `up` series for agent/vminsert/vmselect/vmstorage/vmauth jobs whose value is 0.
expression: |
  up{job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"} == 0

# Condition: reduce with "last", compare "eq" against the threshold, hold for 3m.
need_reduce: true
reducer_type: "last"
condition_type: "eq"
threshold: 0
for: "3m"

# State used when the query returns no data / fails to execute.
no_data_state: "OK"
exec_err_state: "Error"

# Annotations (templated with the alert's labels).
summary: |
  VictoriaMetrics компонент '{{ $labels.job }}' на инстансе {{ $labels.instance }} не отвечает.
description: |
  Компонент VictoriaMetrics '{{ $labels.job }}' на инстансе {{ $labels.instance }} перестал отвечать на запросы.

  **Влияние**: Это критический компонент инфраструктуры мониторинга. Его отказ может привести к потере метрик, неработающим дашбордам или остановке системы алертинга.
|
||||
@ -0,0 +1,20 @@
|
||||
# Alert rule: a VictoriaMetrics component logs too many non-info messages.
name: "VictoriaMetrics Too Many Warning or Error Logs"
# Query: 5m increase of non-"info" log messages per series, summed with the
# app_version / location / is_printed labels removed.
# The trailing filter is kept equal to `threshold` below so the two values
# cannot drift apart (the filter used to be 35 while the threshold was 40,
# which meant values in 36..40 were evaluated but could never fire).
# NOTE(review): 40 was chosen because it is the value that actually governed
# firing; confirm that 40 (and not 35) is the intended limit.
expression: |
  sum(increase(vm_log_messages_total{level!="info", job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"}[5m])) without (app_version, location, is_printed) > 40
threshold: 40
condition_type: "gt"
for: "15m"
need_reduce: true
reducer_type: "last"
# State used when the query returns no data / fails to execute.
no_data_state: "OK"
exec_err_state: "Error"
labels:
  service: "vmcomponents"
  severity: "warning"
  status: "test"
summary: |
  Слишком много сообщений типа "error"/"warning" по {{ $labels.job }} от инстанса {{ $labels.instance }}.
description: |
  Компонент '{{ $labels.job }}' (инстанс {{ $labels.instance }}) генерирует слишком много логов уровня 'warning' или 'error'.

  **Влияние:** Это указывает на наличие скрытых проблем, которые могут привести к деградации производительности или будущим сбоям.
|
||||
@ -0,0 +1,28 @@
|
||||
# Alert rule: high CPU utilisation per instance.
name: "Высокая загрузка CPU"

# Labels attached to the alert.
labels:
  service: "system"
  severity: "critical"
  status: "test"

# Query: 100 * (1 - average idle-mode CPU rate over 5m, grouped by instance),
# for the "self-monitoring" job.
expression: |
  (
    1 - avg by (instance) (rate(node_cpu_seconds_total{job="self-monitoring", mode="idle"}[5m]))
  ) * 100

# Condition: reduce with "max", compare "gt" against the threshold, hold for 5m.
need_reduce: true
reducer_type: "max"
condition_type: "gt"
threshold: 90
for: "5m"

# State used when the query returns no data / fails to execute.
no_data_state: "OK"
exec_err_state: "Error"

# Annotations. The summary reads the value of the expression with refId "B".
summary: |
  High CPU usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} высокая загрузка CPU (более порога в течение заданного времени).
  Это сигнализирует о CPU-bound нагрузке, которая может повышать задержки и время ответа сервисов.

  Что проверить:
  1) top/htop: какие процессы потребляют CPU
  2) mpstat -P ALL 1: распределение по ядрам и steal
  3) run queue/load average (uptime, vmstat)
  4) всплески трафика, cron/job-процессы, фоновые задачи
  5) ошибки и таймауты приложений в логах
|
||||
@ -0,0 +1,26 @@
|
||||
# Alert rule: low available memory per instance.
name: "Мало свободной памяти"

# Labels attached to the alert.
labels:
  service: "system"
  severity: "critical"
  status: "test"

# Query: MemAvailable as a percentage of MemTotal for the "self-monitoring" job.
expression: |
  (node_memory_MemAvailable_bytes{job="self-monitoring"} / node_memory_MemTotal_bytes{job="self-monitoring"}) * 100

# Condition: reduce with "min", compare "lt" against the threshold, hold for 5m.
need_reduce: true
reducer_type: "min"
condition_type: "lt"
threshold: 5
for: "5m"

# State used when the query returns no data / fails to execute.
no_data_state: "OK"
exec_err_state: "Error"

# Annotations. The summary reads the value of the expression with refId "B".
summary: |
  Low free memory {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} осталось очень мало доступной памяти.
  Это сигнализирует о memory pressure, риске OOM kill и деградации производительности.

  Что проверить:
  1) free -h, vmstat 1, swapon -s
  2) top/htop: процессы-лидеры по RSS/heap
  3) OOM события в dmesg/journalctl
  4) major page faults и IO wait (связанные алерты)
  5) лимиты/requests (для k8s) и необходимость увеличения RAM
|
||||
@ -0,0 +1,26 @@
|
||||
# Alert rule: the host boot time changed recently (reboot detected).
name: "Обнаружена перезагрузка сервера"
# Query: 1 when node_boot_time_seconds changed inside the lookback window,
# otherwise 0 (`> bool` returns 0/1 instead of filtering series).
# The window is deliberately longer than `for` below: a change stays visible to
# changes() only while both the old and the new sample are inside the window,
# so a 5m window combined with a 5m pending period could practically never
# stay true long enough to fire. With 15m the condition holds for up to ~15
# minutes after a reboot, which leaves room for the 5m pending period.
expression: |
  changes(node_boot_time_seconds{job="self-monitoring"}[15m]) > bool 0
threshold: 0
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"
# State used when the query returns no data / fails to execute.
no_data_state: "OK"
exec_err_state: "Error"
labels:
  service: "system"
  severity: "warning"
  status: "test"
summary: |
  Reboot detected on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} обнаружен недавний перезапуск.
  Это сигнализирует о возможном аварийном рестарте, плановых работах или проблемах питания/ядра.

  Что проверить:
  1) last reboot и uptime
  2) journalctl -b -1 и kernel-логи до перезапуска
  3) причины: OOM, kernel panic, watchdog, обновления
  4) состояние сервисов после рестарта
  5) повторяемость события и корреляцию с другими алертами
|
||||
24
environments/dev/Seahorse/backend.tf
Normal file
24
environments/dev/Seahorse/backend.tf
Normal file
@ -0,0 +1,24 @@
|
||||
# Terraform settings for this environment: remote state backend and the
# providers the configuration requires.
terraform {
  # Remote state stored in an S3-compatible bucket behind a custom endpoint.
  backend "s3" {
    bucket = "monitoring-vcmt-core-deploy"
    key    = "dev-denis-practic/terraform.tfstate"
    region = "ru-central1"

    endpoints = {
      s3 = "https://storage.yandexcloud.net"
    }

    # Backend validation / lookup steps that are switched off.
    # NOTE(review): presumably required because the endpoint is not AWS itself - confirm.
    skip_credentials_validation = true
    skip_metadata_api_check     = true
    skip_region_validation      = true
    skip_requesting_account_id  = true
    skip_s3_checksum            = true
  }

  required_providers {
    # Grafana provider, any release from 4.7.0 upwards.
    grafana = {
      version = ">= 4.7.0"
      source  = "grafana/grafana"
    }

    # Vault provider; no version constraint is declared here.
    vault = {
      source = "hashicorp/vault"
    }
  }
}
|
||||
11452
environments/dev/Seahorse/dashboards/self-monitoring/vcmt-cluster.json
Normal file
11452
environments/dev/Seahorse/dashboards/self-monitoring/vcmt-cluster.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
50
environments/dev/Seahorse/locals.tf
Normal file
50
environments/dev/Seahorse/locals.tf
Normal file
@ -0,0 +1,50 @@
|
||||
# Environment-level local values: HTTP headers for the Grafana provider and
# the list of alerting contact points.
locals {

  # HTTP headers derived from var.disable_provenance.
  # The X-Disable-Provenance header is emitted only when the variable is true
  # and the map is empty otherwise. Sending the header with the value "false"
  # is avoided on purpose: Grafana decides by the presence of the header, so
  # "false" would still disable provenance.
  # NOTE(review): presence-only check is per the Grafana provisioning API
  # implementation as recalled - confirm against the Grafana version in use.
  grafana_headers = {
    for header, value in { "X-Disable-Provenance" = "true" } :
    header => value
    if var.disable_provenance
  }

  # Contact points configuration.
  # Every entry uses the "slack" type, reads its webhook URL from the
  # `secret_ext` Vault secret and points at the same message template file.
  contact_points = [
    {
      name       = "default"
      type       = "slack"
      is_default = true
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_default"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-critical"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_critical"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-informational"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_info"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-test"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_test"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    }
  ]
}
|
||||
21
environments/dev/Seahorse/providers.tf
Normal file
21
environments/dev/Seahorse/providers.tf
Normal file
@ -0,0 +1,21 @@
|
||||
# Vault provider; the data sources in this file read their secrets through it.
provider "vault" {
  skip_child_token = true
  address          = "https://vault.pyn.ru"
}
|
||||
|
||||
# Secrets for connecting to external sources (mm, clickhouse, etc.).
data "vault_kv_secret_v2" "secret_ext" {
  name  = "groups/infraservice/monitoring/grafana/dev/ext"
  mount = "app"
}
|
||||
|
||||
# Secrets needed for the operation of Grafana itself.
data "vault_kv_secret_v2" "secret_int" {
  name  = "groups/infraservice/monitoring/grafana/dev/int"
  mount = "app"
}
|
||||
|
||||
# Grafana provider instance for the dev Grafana, addressed via the alias "grafana01".
provider "grafana" {
  alias = "grafana01"
  url   = "https://grafana-dev.hhmon.ru/"

  # Extra HTTP headers sent with every API request (built in the locals block).
  http_headers = local.grafana_headers

  # Credentials are read from the `secret_int` Vault secret.
  # NOTE(review): the key is named "...admin_password"; verify the stored value is in
  # the format the provider's `auth` argument expects (token or "user:password").
  auth = data.vault_kv_secret_v2.secret_int.data["grafana_local_admin_password"]
}
|
||||
54
environments/dev/Seahorse/variables_alert.tf
Normal file
54
environments/dev/Seahorse/variables_alert.tf
Normal file
@ -0,0 +1,54 @@
|
||||
# Alert timing and evaluation defaults.

# Evaluation interval for alerts, in seconds.
variable "interval_seconds" {
  type        = number
  default     = 60
  description = "Interval in seconds for evaluating alerts"
}

# Evaluation interval for alert expressions, in milliseconds.
variable "default_interval_ms" {
  type        = number
  default     = 60000
  description = "Default interval in milliseconds for evaluating alert expressions"
}

# Upper bound on the number of data points.
variable "default_max_data_points" {
  type        = number
  default     = 43200
  description = "Default maximum number of data points"
}

# Alert state applied when a query returns no data.
variable "default_no_data_state" {
  type        = string
  default     = "OK"
  description = "Default no data state for alerts"
}

# Alert state applied when query execution fails.
variable "default_exec_err_state" {
  type        = string
  default     = "Error"
  description = "Default execution error state for alerts"
}

# How long a condition has to hold before the alert fires.
variable "default_alert_duration" {
  type        = number
  default     = 300 # 5 minutes
  description = "Default duration (in seconds) for how long a condition must be true before alerting"
}

# Pause between two evaluations of an alert rule.
variable "default_evaluation_interval" {
  type        = number
  default     = 60 # 1 minute
  description = "Default interval (in seconds) between alert rule evaluations"
}

# Lookback window of the main query.
variable "default_time_range_from" {
  type        = number
  default     = 604800 # 7 days
  description = "Default time range (in seconds) for main query lookback"
}

# Time range used by processing blocks.
variable "default_processing_range" {
  type        = number
  default     = 600 # 10 minutes
  description = "Default time range (in seconds) for processing blocks"
}
|
||||
22
environments/dev/Seahorse/variables_auth.tf
Normal file
22
environments/dev/Seahorse/variables_auth.tf
Normal file
@ -0,0 +1,22 @@
|
||||
#variable "grafana_url" {
|
||||
# description = "Grafana URL"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
#variable "grafana_auth" {
|
||||
# description = "Grafana authentication token"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
# Toggle for the X-Disable-Provenance HTTP header that the environment's locals
# build for the Grafana provider. The description previously said
# "provisioning", which is a different concept from provenance.
variable "disable_provenance" {
  description = "Controls whether the X-Disable-Provenance header is sent to Grafana (disables provenance for resources managed through the API)"
  type        = bool
  default     = true
}
|
||||
|
||||
# Free-form description of the Grafana environment; required (no default).
variable "env" {
  type        = string
  description = "Grafana environment description"
}
|
||||
|
||||
|
||||
22
environments/dev/Seahorse/variables_contact_points.tf
Normal file
22
environments/dev/Seahorse/variables_contact_points.tf
Normal file
@ -0,0 +1,22 @@
|
||||
# Contact points supplied as input; defaults to an empty list.
variable "contact_points" {
  type = list(object({
    # Identification of the contact point.
    name = string
    type = string

    # Integration settings as plain string key/value pairs.
    settings = map(string)

    # Optional attributes.
    labels     = optional(map(string))
    is_default = optional(bool, false)
  }))
  default     = []
  description = "List of contact points"
}
|
||||
|
||||
|
||||
#output "contact_point_ids01" {
|
||||
# value = module.grafana_contact_points01.contact_point_ids
|
||||
#}
|
||||
|
||||
#output "contact_point_ids02" {
|
||||
# value = module.grafana_contact_points02.contact_point_ids
|
||||
#}
|
||||
|
||||
|
||||
27
environments/dev/Seahorse/variables_datasource.tf
Normal file
27
environments/dev/Seahorse/variables_datasource.tf
Normal file
@ -0,0 +1,27 @@
|
||||
# Grafana data sources to manage; required (no default).
variable "datasources" {
  type = list(object({
    # --- Main parameters ---
    name        = string           # Name shown in Grafana
    uid         = string           # Unique identifier of the source
    type        = string           # Source type (e.g., prometheus, mysql, clickhouse)
    access_mode = string           # Access mode: proxy or direct
    is_default  = bool             # Whether this is the default source
    url         = optional(string) # Connection URL (for most sources); null when omitted
    username    = optional(string) # null when omitted

    # --- Authentication settings ---
    basic_auth          = optional(bool, false) # Enable basic authentication
    basic_auth_user     = optional(string)      # Basic-auth username; null when omitted
    basic_auth_password = optional(string)      # Basic-auth password; null when omitted

    # --- Additional parameters ---
    json_data        = optional(map(any), {})    # Extra parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format

    # --- Terraform lifecycle management fields ---
    keep_manual_changes         = optional(bool, false) # Ignore manual changes made in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
  description = "List of Grafana data sources"
}
|
||||
|
||||
30
environments/dev/Seahorse/variables_notification_policies.tf
Normal file
30
environments/dev/Seahorse/variables_notification_policies.tf
Normal file
@ -0,0 +1,30 @@
|
||||
# Notification routing rules; each rule may carry one level of nested policies.
# Defaults to an empty list.
variable "notification_policies" {
  type = list(object({
    # Where matching alerts are sent.
    contact_point = string

    # Label matchers selecting the alerts this rule applies to.
    # Allowed `match` operators: = (equality), != (negated equality),
    # =~ (regex equality), !~ (negated regex equality).
    matchers = list(object({
      label = string
      match = string
      value = string
    }))

    # Optional routing and grouping behaviour.
    continue        = optional(bool)
    group_by        = optional(list(string))
    group_wait      = optional(string)
    group_interval  = optional(string)
    repeat_interval = optional(string)

    # Nested policies with the same shape as the parent (minus further nesting);
    # an empty list when omitted.
    policies = optional(list(object({
      contact_point = string

      matchers = list(object({
        label = string
        match = string
        value = string
      }))

      continue        = optional(bool)
      group_by        = optional(list(string))
      group_wait      = optional(string)
      group_interval  = optional(string)
      repeat_interval = optional(string)
    })), [])
  }))
  default     = []
  description = "Routing rules for specific label sets"
}
|
||||
15
environments/dev/Seahorse/variables_organization.tf
Normal file
15
environments/dev/Seahorse/variables_organization.tf
Normal file
@ -0,0 +1,15 @@
|
||||
# Environment-level input: the Grafana organizations to manage; required (no default).
variable "organizations" {
  type = list(object({
    organization_name           = string
    create_new_organization     = bool
    keep_manual_changes         = bool
    prevent_destroy_on_recreate = bool
  }))
  description = "Grafana organization configuration"
}
|
||||
|
||||
# Identifier of the Grafana organization, passed as a string; required (no default).
variable "org_id" {
  type        = string
  description = "Grafana organization ID"
}
|
||||
Reference in New Issue
Block a user