Add dashboard UID auto-generation and Gitea CI workflow

2026-03-25 06:41:19 +03:00
parent 345c5786b3
commit 558a23d916
83 changed files with 53372 additions and 1 deletions
--- a/environments/dev/adibrov/alerts/self-monitoring/disk_space_low_vcmt_nodes.yaml
+++ b/environments/dev/adibrov/alerts/self-monitoring/disk_space_low_vcmt_nodes.yaml
@ -0,0 +1,31 @@
+name: "DEV ADIBROV - Low Disk Space (10%) - VCMT Nodes TEST ALLERT DEV"  # NOTE(review): "ALLERT" looks like a typo for "ALERT" — confirm before renaming
+expression: |  # PromQL: used-space percentage per filesystem, i.e. 100 - (avail * 100 / size)
+   100 - (
+      node_filesystem_avail_bytes{
+        instance=~"ydx-.*:9100",
+        mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
+        fstype=~"(zfs|xfs|ext.)"
+      }
+      * 100
+      /
+      node_filesystem_size_bytes{
+        instance=~"ydx-.*:9100",
+        mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
+        fstype=~"(zfs|xfs|ext.)"
+      }
+      )
+threshold: 90  # compared with the used-space % above; 90% used matches the "10%" free in the name
+for: "1m"
+condition_type: "gt"  # presumably "greater than threshold" — confirm against the rule generator
+need_reduce: true
+reducer_type: "min"  # NOTE(review): presumably the reduce function applied to the query result — confirm
+no_data_state: "NoData"
+exec_err_state: "Error"
+labels:
+  service: "system"
+  severity: "critical"
+  status: "test"  # this rule is a test alert (see description)
+summary: |  # template text; $values.B presumably refers to a reduce step with refID B — TODO confirm
+  {{ printf "%.0f" $values.B.Value }}% Usage on {{ $labels.mountpoint }} ({{ $labels.instance }})
+description: |  # Russian: "TEST ALERT IN THE DEV ENVIRONMENT!!!! DO NOT REACT!!!!"
+  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
--- a/environments/dev/adibrov/alerts/self-monitoring/vmagent_persistent_queue_is_dropping_data.yaml
+++ b/environments/dev/adibrov/alerts/self-monitoring/vmagent_persistent_queue_is_dropping_data.yaml
@ -0,0 +1,20 @@
+name: "DEV ADIBROV - Vmagent Persistent Queue Is Dropping DataTEST ALLERT DEV"  # NOTE(review): missing space in "DataTEST" and "ALLERT" spelling — confirm before renaming
+expression: |  # PromQL: 5m increase of dropped persistent-queue bytes, summed over all labels except "path", kept only when > 0
+  sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0
+threshold: 0  # NOTE(review): the expression already applies "> 0", so this comparison is duplicated — confirm intended
+for: "10m"
+condition_type: "gt"
+need_reduce: true
+reducer_type: "sum"
+no_data_state: "OK"  # the "> 0" filter returns no series when nothing was dropped; presumably why no data maps to OK
+exec_err_state: "KeepLast"
+labels:
+  service: "vmagent"
+  severity: "critical"
+  status: "test"
+summary: |  # Russian: "Instance <instance> is dropping data from an overflowing buffer."
+  Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
+description: |  # Russian: vmagent had to drop data from its on-disk buffer; impact: metrics are actively being lost
+  VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.
+  
+  **Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.
--- a/environments/dev/adibrov/alerts/self-monitoring/vmagent_too_many_scrape_errors.yaml
+++ b/environments/dev/adibrov/alerts/self-monitoring/vmagent_too_many_scrape_errors.yaml
@ -0,0 +1,20 @@
+name: "DEV ADIBROV - Vmagent Too Many Scrape ErrorsTEST ALLERT DEV"  # NOTE(review): missing space in "ErrorsTEST" and "ALLERT" spelling — confirm before renaming
+expression: |  # PromQL: 5m increase of failed scrapes per series, kept only when above 35
+  increase(vm_promscrape_scrapes_failed_total{job=~".*agent.*"}[5m]) > 35
+threshold: 40 # temporary threshold; NOTE(review): differs from the "> 35" in the expression — confirm which value is intended
+for: "15m"
+condition_type: "gt"
+need_reduce: false
+no_data_state: "OK"
+exec_err_state: "KeepLast"
+labels:
+  service: "vmagent"
+  severity: "warning"
+  status: "test"
+summary: |  # Russian: "Vmagent cannot scrape one or more targets on instance <instance>."
+  Vmagent не может собрать один или несколько target'ов на инстансе {{ $labels.instance }}.
+description: |  # Russian: the job has failed to scrape targets for the last 15 minutes; impact: metrics from the target service are lost
+  Job "{{ $labels.job }}" на инстансе {{ $labels.instance }} не может успешно скрапить target'ы в течение последних 15 минут.
+  
+  **Влияние:** ПРОИСХОДИТ ПРЯМАЯ ПОТЕРЯ МЕТРИК ОТ ЦЕЛЕВОГО СЕРВИСА!
+  Вы не получаете данные от одного или нескольких наблюдаемых сервисов. Дашборды и алерты, связанные с этими target'ами, будут показывать неполную или устаревшую информацию в мониторинге.