Add dashboard UID auto-generation and Gitea CI workflow

2026-03-25 06:41:19 +03:00
parent 345c5786b3
commit 558a23d916
83 changed files with 53372 additions and 1 deletions
--- a/environments/dev/Seahorse/alerts/self-monitoring/vmagent_persistent_queue_is_dropping_data.yaml
+++ b/environments/dev/Seahorse/alerts/self-monitoring/vmagent_persistent_queue_is_dropping_data.yaml
@ -0,0 +1,20 @@
+name: 'Vmagent Persistent Queue Is Dropping Data'
+labels:
+  severity: 'critical'
+  service: 'vmagent'
+  status: 'test'
+expression: |
+  sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0
+condition_type: 'gt'
+threshold: 0
+for: '10m'
+need_reduce: true
+reducer_type: 'sum'
+no_data_state: 'OK'  # the "> 0" filter yields no series while nothing is being dropped
+exec_err_state: 'KeepLast'
+summary: |
+  Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
+description: |
+  VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.
+
+  **Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.
--- a/environments/dev/Seahorse/alerts/self-monitoring/vmcomponents_down.yaml
+++ b/environments/dev/Seahorse/alerts/self-monitoring/vmcomponents_down.yaml
@ -0,0 +1,20 @@
+name: 'VictoriaMetrics components down'
+labels:
+  severity: 'critical'
+  service: 'vmcomponents'
+  status: 'test'
+expression: |
+  up{job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"} == 0
+condition_type: 'eq'
+threshold: 0
+for: '3m'
+need_reduce: true
+reducer_type: 'last'
+no_data_state: 'OK'  # "== 0" filters out healthy targets, so they yield no series
+exec_err_state: 'Error'
+summary: |
+  VictoriaMetrics компонент '{{ $labels.job }}' на инстансе {{ $labels.instance }} не отвечает.
+description: |
+  Компонент VictoriaMetrics '{{ $labels.job }}' на инстансе {{ $labels.instance }} перестал отвечать на запросы.
+
+  **Влияние**: Это критический компонент инфраструктуры мониторинга. Его отказ может привести к потере метрик, неработающим дашбордам или остановке системы алертинга.
--- a/environments/dev/Seahorse/alerts/self-monitoring/vmcomponents_warnings_error_logs.yaml
+++ b/environments/dev/Seahorse/alerts/self-monitoring/vmcomponents_warnings_error_logs.yaml
@ -0,0 +1,20 @@
+name: "VictoriaMetrics Too Many Warning or Error Logs"
+expression: |
+  sum(increase(vm_log_messages_total{level!="info", job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"}[5m])) without (app_version, location, is_printed) > 40
+threshold: 40  # keep in sync with the bound inside the expression above
+for: "15m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "last"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "vmcomponents"
+  severity: "warning"
+  status: "test"
+summary: |
+  Слишком много сообщений типа "error"/"warning" по {{ $labels.job }} от инстанса {{ $labels.instance }}.
+description: |
+  Компонент '{{ $labels.job }}' (инстанс {{ $labels.instance }}) генерирует слишком много логов уровня 'warning' или 'error'.
+
+  **Влияние:** Это указывает на наличие скрытых проблем, которые могут привести к деградации производительности или будущим сбоям.
--- a/environments/dev/Seahorse/alerts/system/system_high_cpu_usage.yaml
+++ b/environments/dev/Seahorse/alerts/system/system_high_cpu_usage.yaml
@ -0,0 +1,28 @@
+name: 'Высокая загрузка CPU'
+labels:
+  severity: 'critical'
+  service: 'system'
+  status: 'test'
+expression: |
+  (
+    1 - avg by (instance) (rate(node_cpu_seconds_total{job="self-monitoring", mode="idle"}[5m]))
+  ) * 100
+condition_type: 'gt'
+threshold: 90  # percent of non-idle CPU time, averaged per instance
+for: '5m'
+need_reduce: true
+reducer_type: 'max'
+no_data_state: 'OK'
+exec_err_state: 'Error'
+summary: |
+  High CPU usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
+description: |
+  На {{ $labels.instance }} высокая загрузка CPU (более порога в течение заданного времени).
+  Это сигнализирует о CPU-bound нагрузке, которая может повышать задержки и время ответа сервисов.
+
+  Что проверить:
+  1) top/htop: какие процессы потребляют CPU
+  2) mpstat -P ALL 1: распределение по ядрам и steal
+  3) run queue/load average (uptime, vmstat)
+  4) всплески трафика, cron/job-процессы, фоновые задачи
+  5) ошибки и таймауты приложений в логах
--- a/environments/dev/Seahorse/alerts/system/system_low_free_memory.yaml
+++ b/environments/dev/Seahorse/alerts/system/system_low_free_memory.yaml
@ -0,0 +1,26 @@
+name: 'Мало свободной памяти'
+labels:
+  severity: 'critical'
+  service: 'system'
+  status: 'test'
+expression: |
+  (node_memory_MemAvailable_bytes{job="self-monitoring"} / node_memory_MemTotal_bytes{job="self-monitoring"}) * 100
+condition_type: 'lt'
+threshold: 5  # percent of MemTotal that is still available
+for: '5m'
+need_reduce: true
+reducer_type: 'min'
+no_data_state: 'OK'
+exec_err_state: 'Error'
+summary: |
+  Low free memory {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
+description: |
+  На {{ $labels.instance }} осталось очень мало доступной памяти.
+  Это сигнализирует о memory pressure, риске OOM kill и деградации производительности.
+
+  Что проверить:
+  1) free -h, vmstat 1, swapon -s
+  2) top/htop: процессы-лидеры по RSS/heap
+  3) OOM события в dmesg/journalctl
+  4) major page faults и IO wait (связанные алерты)
+  5) лимиты/requests (для k8s) и необходимость увеличения RAM
--- a/environments/dev/Seahorse/alerts/system/system_server_reboot.yaml
+++ b/environments/dev/Seahorse/alerts/system/system_server_reboot.yaml
@ -0,0 +1,26 @@
+name: "Обнаружена перезагрузка сервера"
+expression: |
+  changes(node_boot_time_seconds{job="self-monitoring"}[15m]) > bool 0
+threshold: 0
+for: "5m"  # must stay shorter than the changes() window, otherwise the rule cannot leave Pending
+condition_type: "gt"
+need_reduce: true
+reducer_type: "max"
+no_data_state: "OK"
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "warning"
+  status: "test"
+summary: |
+  Reboot detected on {{ $labels.instance }}
+description: |
+  На {{ $labels.instance }} обнаружен недавний перезапуск.
+  Это сигнализирует о возможном аварийном рестарте, плановых работах или проблемах питания/ядра.
+
+  Что проверить:
+  1) last reboot и uptime
+  2) journalctl -b -1 и kernel-логи до перезапуска
+  3) причины: OOM, kernel panic, watchdog, обновления
+  4) состояние сервисов после рестарта
+  5) повторяемость события и корреляцию с другими алертами
--- a/environments/dev/Seahorse/backend.tf
+++ b/environments/dev/Seahorse/backend.tf
@ -0,0 +1,24 @@
+terraform {
+  # Remote state is kept in an S3-compatible bucket. The endpoint is Yandex Object
+  # Storage rather than AWS, which is presumably why the skip_* checks are disabled.
+  backend "s3" {
+    bucket = "monitoring-vcmt-core-deploy"
+    key    = "dev-denis-practic/terraform.tfstate"
+    region = "ru-central1"
+
+    endpoints = {
+      s3 = "https://storage.yandexcloud.net"
+    }
+
+    skip_credentials_validation = true
+    skip_metadata_api_check     = true
+    skip_region_validation      = true
+    skip_requesting_account_id  = true
+    skip_s3_checksum            = true
+  }
+
+  # Providers used by this environment.
+  required_providers {
+    vault = {
+      source = "hashicorp/vault"
+    }
+    grafana = {
+      version = ">= 4.7.0"
+      source  = "grafana/grafana"
+    }
+  }
+}
--- a/environments/dev/Seahorse/dashboards/self-monitoring/vcmt-cluster.json
+++ b/environments/dev/Seahorse/dashboards/self-monitoring/vcmt-cluster.json
--- a/environments/dev/Seahorse/dashboards/self-monitoring/vcmt-vmagent.json
+++ b/environments/dev/Seahorse/dashboards/self-monitoring/vcmt-vmagent.json
--- a/environments/dev/Seahorse/dashboards/self-monitoring/vcmt-vmauth.json
+++ b/environments/dev/Seahorse/dashboards/self-monitoring/vcmt-vmauth.json
--- a/environments/dev/Seahorse/locals.tf
+++ b/environments/dev/Seahorse/locals.tf
@ -0,0 +1,50 @@
+locals {
+  # HTTP headers handed to the Grafana provider, derived from var.disable_provenance.
+  # NOTE(review): Grafana appears to check only for the presence of X-Disable-Provenance,
+  # not its value, so sending "false" would still disable provenance. The header is
+  # therefore omitted entirely when disable_provenance is false.
+  grafana_headers = {
+    for header, value in { "X-Disable-Provenance" = "true" } :
+    header => value if var.disable_provenance
+  }
+
+  # Contact points configuration. Webhook URLs are read from the external Vault secret;
+  # every contact point uses the same message template.
+  contact_points = [
+    {
+      name       = "default"
+      type       = "slack"
+      is_default = true
+      settings = {
+        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_default"]
+        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
+      }
+    },
+
+    {
+      name       = "infra-alerts-critical"
+      type       = "slack"
+      is_default = false
+      settings = {
+        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_critical"]
+        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
+      }
+    },
+
+    {
+      name       = "infra-alerts-informational"
+      type       = "slack"
+      is_default = false
+      settings = {
+        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_info"]
+        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
+      }
+    },
+
+    {
+      name       = "infra-alerts-test"
+      type       = "slack"
+      is_default = false
+      settings = {
+        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_test"]
+        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
+      }
+    }
+  ]
+}
--- a/environments/dev/Seahorse/providers.tf
+++ b/environments/dev/Seahorse/providers.tf
@ -0,0 +1,21 @@
+# Vault connection used by the vault_kv_secret_v2 data sources.
+provider "vault" {
+  skip_child_token = true
+  address          = "https://vault.pyn.ru"
+}
+
+# Secrets for connecting to external systems (mm, clickhouse, etc.)
+data "vault_kv_secret_v2" "secret_ext" {
+  name  = "groups/infraservice/monitoring/grafana/dev/ext"
+  mount = "app"
+}
+
+# Secrets used by Grafana itself
+data "vault_kv_secret_v2" "secret_int" {
+  name  = "groups/infraservice/monitoring/grafana/dev/int"
+  mount = "app"
+}
+
+# Aliased Grafana provider; auth is read from the internal Vault secret and the
+# extra HTTP headers come from local.grafana_headers.
+provider "grafana" {
+  alias = "grafana01"
+  url   = "https://grafana-dev.hhmon.ru/"
+
+  http_headers = local.grafana_headers
+  auth         = data.vault_kv_secret_v2.secret_int.data["grafana_local_admin_password"]
+}
--- a/environments/dev/Seahorse/variables_alert.tf
+++ b/environments/dev/Seahorse/variables_alert.tf
@ -0,0 +1,54 @@
+# Alert timing and evaluation defaults
+
+variable "interval_seconds" {
+  type        = number
+  description = "Interval in seconds for evaluating alerts"
+  default     = 60
+}
+
+variable "default_interval_ms" {
+  type        = number
+  description = "Default interval in milliseconds for evaluating alert expressions"
+  default     = 60000
+}
+
+variable "default_max_data_points" {
+  type        = number
+  description = "Default maximum number of data points"
+  default     = 43200
+}
+
+variable "default_no_data_state" {
+  type        = string
+  description = "Default no data state for alerts"
+  default     = "OK"
+}
+
+variable "default_exec_err_state" {
+  type        = string
+  description = "Default execution error state for alerts"
+  default     = "Error"
+}
+
+variable "default_alert_duration" {
+  type        = number
+  description = "Default duration (in seconds) for how long a condition must be true before alerting"
+  # 5 minutes
+  default = 300
+}
+
+variable "default_evaluation_interval" {
+  type        = number
+  description = "Default interval (in seconds) between alert rule evaluations"
+  # 1 minute
+  default = 60
+}
+
+variable "default_time_range_from" {
+  type        = number
+  description = "Default time range (in seconds) for main query lookback"
+  # 7 days
+  default = 604800
+}
+
+variable "default_processing_range" {
+  type        = number
+  description = "Default time range (in seconds) for processing blocks"
+  # 10 minutes
+  default = 600
+}
--- a/environments/dev/Seahorse/variables_auth.tf
+++ b/environments/dev/Seahorse/variables_auth.tf
@ -0,0 +1,22 @@
+#variable "grafana_url" {
+#  description = "Grafana URL"
+#  type        = string
+#}
+
+#variable "grafana_auth" {
+#  description = "Grafana authentication token"
+#  type        = string
+#}
+
+# Drives the X-Disable-Provenance HTTP header configured for the Grafana provider.
+variable "disable_provenance" {
+  description = "Whether to disable Grafana provenance tracking (X-Disable-Provenance header) for resources managed through the API"
+  type        = bool
+  default     = true
+}
+
+# Required: no default is provided.
+variable "env" {
+  type        = string
+  description = "Grafana environment description"
+}
+
+
--- a/environments/dev/Seahorse/variables_contact_points.tf
+++ b/environments/dev/Seahorse/variables_contact_points.tf
@ -0,0 +1,22 @@
+# Optional list of contact points; empty unless overridden.
+variable "contact_points" {
+  type = list(object({
+    name       = string
+    type       = string
+    settings   = map(string)
+    labels     = optional(map(string))
+    is_default = optional(bool, false)
+  }))
+  description = "List of contact points"
+  default     = []
+}
+
+
+#output "contact_point_ids01" {
+#  value = module.grafana_contact_points01.contact_point_ids
+#}
+
+#output "contact_point_ids02" {
+#  value = module.grafana_contact_points02.contact_point_ids
+#}
+
+
--- a/environments/dev/Seahorse/variables_datasource.tf
+++ b/environments/dev/Seahorse/variables_datasource.tf
@ -0,0 +1,27 @@
+# Required list of data source definitions (no default).
+variable "datasources" {
+  description = "List of Grafana data sources"
+  type = list(object({
+    # --- Identity and connection ---
+    # Data source name (displayed in Grafana)
+    name = string
+    # Unique source identifier
+    uid = string
+    # Data source type (e.g., prometheus, mysql, clickhouse)
+    type = string
+    # Connection URL (for most sources)
+    url      = optional(string, null)
+    username = optional(string, null)
+    # Access mode: proxy or direct
+    access_mode = string
+    # Set as default source
+    is_default = bool
+
+    # --- Basic authentication ---
+    # Use basic authentication
+    basic_auth = optional(bool, false)
+    # Username for basic authentication
+    basic_auth_user = optional(string, null)
+    # Password for basic authentication
+    basic_auth_password = optional(string, null)
+
+    # --- Extra settings ---
+    # Additional parameters in JSON format
+    json_data = optional(map(any), {})
+    # Sensitive data in JSON format
+    secure_json_data = optional(map(string), {})
+
+    # --- Terraform lifecycle management ---
+    # Ignore manual changes in Grafana
+    keep_manual_changes = optional(bool, false)
+    # Prevent resource deletion on update
+    prevent_destroy_on_recreate = optional(bool, false)
+  }))
+}
+
--- a/environments/dev/Seahorse/variables_notification_policies.tf
+++ b/environments/dev/Seahorse/variables_notification_policies.tf
@ -0,0 +1,30 @@
+# Routing tree: each top-level policy may carry one level of nested policies.
+variable "notification_policies" {
+  type = list(object({
+    contact_point = string
+    # Allowed match operators: = (equality), != (negated equality),
+    # =~ (regex equality), !~ (negated regex equality)
+    matchers = list(object({
+      label = string
+      match = string
+      value = string
+    }))
+    continue        = optional(bool)
+    group_by        = optional(list(string))
+    group_wait      = optional(string)
+    group_interval  = optional(string)
+    repeat_interval = optional(string)
+
+    # Nested policies share the parent's shape, minus further nesting.
+    policies = optional(list(object({
+      contact_point = string
+      matchers = list(object({
+        label = string
+        match = string
+        value = string
+      }))
+      continue        = optional(bool)
+      group_by        = optional(list(string))
+      group_wait      = optional(string)
+      group_interval  = optional(string)
+      repeat_interval = optional(string)
+    })), [])
+  }))
+  description = "Routing rules for specific label sets"
+  default     = []
+}
--- a/environments/dev/Seahorse/variables_organization.tf
+++ b/environments/dev/Seahorse/variables_organization.tf
@ -0,0 +1,15 @@
+# Environment-level organization input (required; every object field is mandatory)
+variable "organizations" {
+  type = list(object({
+    organization_name           = string
+    create_new_organization     = bool
+    keep_manual_changes         = bool
+    prevent_destroy_on_recreate = bool
+  }))
+  description = "Grafana organization configuration"
+}
+
+# Required: the organization ID is passed as a string.
+variable "org_id" {
+  type        = string
+  description = "Grafana organization ID"
+}