Add dashboard UID auto-generation and Gitea CI workflow
This commit is contained in:
18
environments/README — копия.md
Normal file
18
environments/README — копия.md
Normal file
@ -0,0 +1,18 @@
|
||||
# grafana-terraform
|
||||
Конфигурация Grafana
|
||||
## Интеграция с Hashicorp Vault
|
||||
В кластере **vault.pyn.ru** создана **approle infraservice-iac** с правами на чтение секретов из хранилища `app/data/groups/infraservice/*`.
|
||||
Все члены группы **/vault-infraservice** могут посмотреть role-id с помощью `vault read auth/approle/role/infraservice-iac/role-id`, а также создать secret-id командой cli `vault write -f auth/approle/role/infraservice-iac/secret-id`.
|
||||
Для успешной авторизации в Vault перед запуском нужно установить переменную окружения `VAULT_TOKEN`, содержащую токен авторизации.
|
||||
## Запуск
|
||||
Запуск осуществляется из директории окружения, например `environments/test`. Перед запуском необходимо загрузить переменные окружения из файла `.env` (например, командой `source .env`) со следующим содержимым:
|
||||
```bash
|
||||
# Export every variable assigned below so that child processes can see it.
set -a
# Keys for the S3 backends can be found in Vault.
AWS_ACCESS_KEY_ID="<ключ для доступа к s3 бэкэнду>"
AWS_SECRET_ACCESS_KEY="<секретный ключ для доступа к s3 бэкэнду>"
VAULT_ROLE_ID="<role_id для approle vault>"
VAULT_SECRET_ID="<secret_id для approle vault>"
# Log in to Vault through the approle endpoint and keep only the client token.
VAULT_TOKEN=$(curl -s -X POST -d "{\"role_id\":\"$VAULT_ROLE_ID\",\"secret_id\":\"$VAULT_SECRET_ID\"}" \
  https://vault.pyn.ru/v1/auth/approle/login | jq -r .auth.client_token)
# Turn auto-export off again so that sourcing this file does not leave the
# calling shell exporting every variable it assigns afterwards.
set +a
|
||||
```
|
||||
@ -0,0 +1,20 @@
|
||||
# Alert rule definition: vmagent is dropping data from its persistent queue.
# How these fields are consumed is decided by the Terraform module that loads
# this file (not visible here).
name: 'Vmagent Persistent Queue Is Dropping Data'

# Evaluation parameters.
for: '10m'
condition_type: 'gt'
threshold: 0
need_reduce: true
reducer_type: 'sum'
no_data_state: 'OK'
exec_err_state: 'KeepLast'

# 5-minute increase of dropped persistent-queue bytes for jobs matching
# ".*agent.*", summed over every label except `path`; only results > 0 are kept.
expression: |
  sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0

labels:
  status: 'test'
  severity: 'critical'
  service: 'vmagent'

summary: |
  Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
description: |
  VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.

  **Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.
|
||||
@ -0,0 +1,20 @@
|
||||
# Alert rule definition: a VictoriaMetrics component stopped answering scrapes.
name: 'VictoriaMetrics components down'

# Evaluation parameters.
for: '3m'
condition_type: 'eq'
threshold: 0
need_reduce: true
reducer_type: 'last'
no_data_state: 'OK'
exec_err_state: 'Error'

# Keeps only `up` series equal to 0 for jobs whose name contains agent,
# vminsert, vmselect, vmstorage or vmauth.
expression: |
  up{job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"} == 0

labels:
  status: 'test'
  severity: 'critical'
  service: 'vmcomponents'

summary: |
  VictoriaMetrics компонент '{{ $labels.job }}' на инстансе {{ $labels.instance }} не отвечает.
description: |
  Компонент VictoriaMetrics '{{ $labels.job }}' на инстансе {{ $labels.instance }} перестал отвечать на запросы.

  **Влияние**: Это критический компонент инфраструктуры мониторинга. Его отказ может привести к потере метрик, неработающим дашбордам или остановке системы алертинга.
|
||||
@ -0,0 +1,20 @@
|
||||
# Alert rule definition: a VictoriaMetrics component logs too many
# non-info messages.
name: 'VictoriaMetrics Too Many Warning or Error Logs'

# Evaluation parameters.
# NOTE(review): the expression below filters at "> 35" while `threshold` is 40.
# Confirm which value is intended and whether both comparisons are needed.
for: '15m'
condition_type: 'gt'
threshold: 40
need_reduce: true
reducer_type: 'last'
no_data_state: 'OK'
exec_err_state: 'Error'

# 5-minute increase of log messages with level != "info", summed over every
# label except app_version, location and is_printed; only results > 35 are kept.
expression: |
  sum(increase(vm_log_messages_total{level!="info", job=~".*(agent|vminsert|vmselect|vmstorage|vmauth).*"}[5m])) without (app_version, location, is_printed) > 35

labels:
  status: 'test'
  severity: 'warning'
  service: 'vmcomponents'

summary: |
  Слишком много сообщений типа "error"/"warning" по {{ $labels.job }} от инстанса {{ $labels.instance }}.
description: |
  Компонент '{{ $labels.job }}' (инстанс {{ $labels.instance }}) генерирует слишком много логов уровня 'warning' или 'error'.

  **Влияние:** Это указывает на наличие скрытых проблем, которые могут привести к деградации производительности или будущим сбоям.
|
||||
@ -0,0 +1,28 @@
|
||||
# Alert rule definition: high CPU usage on a host.
name: 'Высокая загрузка CPU'

# Evaluation parameters.
for: '5m'
condition_type: 'gt'
threshold: 90
need_reduce: true
reducer_type: 'max'
no_data_state: 'OK'
exec_err_state: 'Error'

# Per-instance CPU usage in percent: 100 * (1 - average idle rate over 5m)
# for the "self-monitoring" job.
expression: |
  (
    1 - avg by (instance) (rate(node_cpu_seconds_total{job="self-monitoring", mode="idle"}[5m]))
  ) * 100

labels:
  status: 'test'
  severity: 'critical'
  service: 'system'

summary: |
  High CPU usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} высокая загрузка CPU (более порога в течение заданного времени).
  Это сигнализирует о CPU-bound нагрузке, которая может повышать задержки и время ответа сервисов.

  Что проверить:
  1) top/htop: какие процессы потребляют CPU
  2) mpstat -P ALL 1: распределение по ядрам и steal
  3) run queue/load average (uptime, vmstat)
  4) всплески трафика, cron/job-процессы, фоновые задачи
  5) ошибки и таймауты приложений в логах
|
||||
@ -0,0 +1,26 @@
|
||||
# Alert rule definition: very little available memory left on a host.
name: 'Мало свободной памяти'

# Evaluation parameters.
for: '5m'
condition_type: 'lt'
threshold: 5
need_reduce: true
reducer_type: 'min'
no_data_state: 'OK'
exec_err_state: 'Error'

# Available memory as a percentage of total memory for the
# "self-monitoring" job.
expression: |
  (node_memory_MemAvailable_bytes{job="self-monitoring"} / node_memory_MemTotal_bytes{job="self-monitoring"}) * 100

labels:
  status: 'test'
  severity: 'critical'
  service: 'system'

summary: |
  Low free memory {{ printf "%.0f" $values.B.Value }}% on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} осталось очень мало доступной памяти.
  Это сигнализирует о memory pressure, риске OOM kill и деградации производительности.

  Что проверить:
  1) free -h, vmstat 1, swapon -s
  2) top/htop: процессы-лидеры по RSS/heap
  3) OOM события в dmesg/journalctl
  4) major page faults и IO wait (связанные алерты)
  5) лимиты/requests (для k8s) и необходимость увеличения RAM
|
||||
@ -0,0 +1,26 @@
|
||||
# Alert rule definition: a host reboot was detected.
name: 'Обнаружена перезагрузка сервера'

# Evaluation parameters.
# NOTE(review): the expression only looks back 5 minutes, and `for` is also 5m.
# Depending on how the module evaluates and reduces the query, the condition may
# not stay true long enough for the rule to leave the pending state. Verify that
# this alert can actually fire.
for: '5m'
condition_type: 'gt'
threshold: 0
need_reduce: true
reducer_type: 'max'
no_data_state: 'OK'
exec_err_state: 'Error'

# 1 when node_boot_time_seconds changed within the last 5 minutes, otherwise 0
# (`> bool` returns 0/1 instead of filtering series).
expression: |
  changes(node_boot_time_seconds{job="self-monitoring"}[5m]) > bool 0

labels:
  status: 'test'
  severity: 'warning'
  service: 'system'

summary: |
  Reboot detected on {{ $labels.instance }}
description: |
  На {{ $labels.instance }} обнаружен недавний перезапуск.
  Это сигнализирует о возможном аварийном рестарте, плановых работах или проблемах питания/ядра.

  Что проверить:
  1) last reboot и uptime
  2) journalctl -b -1 и kernel-логи до перезапуска
  3) причины: OOM, kernel panic, watchdog, обновления
  4) состояние сервисов после рестарта
  5) повторяемость события и корреляцию с другими алертами
|
||||
24
environments/dev/Seahorse/backend.tf
Normal file
24
environments/dev/Seahorse/backend.tf
Normal file
@ -0,0 +1,24 @@
|
||||
# Terraform settings for this environment: remote state and required providers.
terraform {
  # State is kept in an S3-compatible bucket behind storage.yandexcloud.net.
  backend "s3" {
    bucket = "monitoring-vcmt-core-deploy"
    key    = "dev-denis-practic/terraform.tfstate"
    region = "ru-central1"

    endpoints = {
      s3 = "https://storage.yandexcloud.net"
    }

    # Validation and metadata lookups of the S3 backend are switched off;
    # presumably because the endpoint is not AWS itself.
    skip_credentials_validation = true
    skip_metadata_api_check     = true
    skip_region_validation      = true
    skip_requesting_account_id  = true
    skip_s3_checksum            = true
  }

  required_providers {
    # NOTE(review): no version constraint is set for the vault provider.
    vault = {
      source = "hashicorp/vault"
    }
    grafana = {
      source  = "grafana/grafana"
      version = ">= 4.7.0"
    }
  }
}
|
||||
11452
environments/dev/Seahorse/dashboards/self-monitoring/vcmt-cluster.json
Normal file
11452
environments/dev/Seahorse/dashboards/self-monitoring/vcmt-cluster.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
50
environments/dev/Seahorse/locals.tf
Normal file
50
environments/dev/Seahorse/locals.tf
Normal file
@ -0,0 +1,50 @@
|
||||
locals {
  # Extra HTTP headers for the Grafana provider, derived from
  # var.disable_provenance.
  #
  # The header is emitted only when provenance must be disabled. The previous
  # form always sent it (with the value "true" or "false"); a server that checks
  # only for the header's presence would treat "false" as a request to disable
  # provenance as well. Omitting the header is unambiguous in both cases.
  grafana_headers = {
    for header, value in { "X-Disable-Provenance" = "true" } :
    header => value if var.disable_provenance
  }

  # Contact points configuration. Webhook URLs are read from the Vault secret
  # data.vault_kv_secret_v2.secret_ext; every contact point uses the same
  # message template file.
  # NOTE(review): the template path is relative; confirm it resolves from the
  # directory Terraform is run in.
  contact_points = [
    {
      name       = "default"
      type       = "slack"
      is_default = true
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_default"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-critical"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_critical"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-informational"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_info"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-test"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_test"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    }
  ]
}
|
||||
21
environments/dev/Seahorse/providers.tf
Normal file
21
environments/dev/Seahorse/providers.tf
Normal file
@ -0,0 +1,21 @@
|
||||
# Vault provider. No credentials are configured here, so they have to come
# from the environment the run is started in.
provider "vault" {
  skip_child_token = true
  address          = "https://vault.pyn.ru"
}
|
||||
|
||||
# Secrets for connecting to external systems (mm, clickhouse, etc.).
data "vault_kv_secret_v2" "secret_ext" {
  name  = "groups/infraservice/monitoring/grafana/dev/ext"
  mount = "app"
}
|
||||
|
||||
# Secrets used by Grafana itself.
data "vault_kv_secret_v2" "secret_int" {
  name  = "groups/infraservice/monitoring/grafana/dev/int"
  mount = "app"
}
|
||||
|
||||
# Grafana provider (alias "grafana01") for the dev Grafana instance.
# Credentials come from the internal Vault secret; extra request headers come
# from local.grafana_headers.
provider "grafana" {
  alias        = "grafana01"
  http_headers = local.grafana_headers
  url          = "https://grafana-dev.hhmon.ru/"
  # NOTE(review): the key is named "..._password"; confirm the stored value has
  # the format the provider's `auth` argument expects.
  auth = data.vault_kv_secret_v2.secret_int.data["grafana_local_admin_password"]
}
|
||||
54
environments/dev/Seahorse/variables_alert.tf
Normal file
54
environments/dev/Seahorse/variables_alert.tf
Normal file
@ -0,0 +1,54 @@
|
||||
# Alert duration and timing configuration
|
||||
# Alert evaluation interval, in seconds.
# NOTE(review): default_evaluation_interval in this file has the same default
# and a very similar description; confirm both are needed.
variable "interval_seconds" {
  type        = number
  default     = 60
  description = "Interval in seconds for evaluating alerts"
}
|
||||
|
||||
# Default interval for alert expressions, in milliseconds (60000 ms = 60 s).
variable "default_interval_ms" {
  type        = number
  default     = 60000
  description = "Default interval in milliseconds for evaluating alert expressions"
}
|
||||
|
||||
# Default upper bound on the number of data points.
variable "default_max_data_points" {
  type        = number
  default     = 43200
  description = "Default maximum number of data points"
}
|
||||
|
||||
# Default state for alerts whose query returns no data.
variable "default_no_data_state" {
  type        = string
  default     = "OK"
  description = "Default no data state for alerts"
}
|
||||
|
||||
# Default state for alerts whose evaluation fails.
variable "default_exec_err_state" {
  type        = string
  default     = "Error"
  description = "Default execution error state for alerts"
}
|
||||
|
||||
# Default time a condition must hold before alerting, in seconds.
variable "default_alert_duration" {
  type        = number
  default     = 300 # 5 minutes
  description = "Default duration (in seconds) for how long a condition must be true before alerting"
}
|
||||
|
||||
# Default pause between two evaluations of an alert rule, in seconds.
variable "default_evaluation_interval" {
  type        = number
  default     = 60 # 1 minute
  description = "Default interval (in seconds) between alert rule evaluations"
}
|
||||
|
||||
# Default lookback of the main query, in seconds.
variable "default_time_range_from" {
  type        = number
  default     = 604800 # 7 days
  description = "Default time range (in seconds) for main query lookback"
}
|
||||
|
||||
# Default time range of the processing blocks, in seconds.
variable "default_processing_range" {
  type        = number
  default     = 600 # 10 minutes
  description = "Default time range (in seconds) for processing blocks"
}
|
||||
22
environments/dev/Seahorse/variables_auth.tf
Normal file
22
environments/dev/Seahorse/variables_auth.tf
Normal file
@ -0,0 +1,22 @@
|
||||
#variable "grafana_url" {
|
||||
# description = "Grafana URL"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
#variable "grafana_auth" {
|
||||
# description = "Grafana authentication token"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
# Switch behind the "X-Disable-Provenance" HTTP header built in locals.tf.
# The description previously said "provisioning", but the flag is only used
# for the provenance header.
variable "disable_provenance" {
  description = "Controls whether Grafana provenance is disabled (X-Disable-Provenance header)"
  type        = bool
  default     = true
}
|
||||
|
||||
# Free-form description of the Grafana environment. It has no default, so a
# value must be supplied.
variable "env" {
  type        = string
  description = "Grafana environment description"
}
|
||||
|
||||
|
||||
22
environments/dev/Seahorse/variables_contact_points.tf
Normal file
22
environments/dev/Seahorse/variables_contact_points.tf
Normal file
@ -0,0 +1,22 @@
|
||||
# Contact points supplied as an input variable; defaults to an empty list.
# NOTE(review): locals.tf in this directory also defines `contact_points` as a
# local value; confirm which of the two is passed to the module.
variable "contact_points" {
  description = "List of contact points"
  default     = []

  type = list(object({
    name       = string
    type       = string
    settings   = map(string)
    is_default = optional(bool, false)
    labels     = optional(map(string))
  }))
}
|
||||
|
||||
|
||||
#output "contact_point_ids01" {
|
||||
# value = module.grafana_contact_points01.contact_point_ids
|
||||
#}
|
||||
|
||||
#output "contact_point_ids02" {
|
||||
# value = module.grafana_contact_points02.contact_point_ids
|
||||
#}
|
||||
|
||||
|
||||
27
environments/dev/Seahorse/variables_datasource.tf
Normal file
27
environments/dev/Seahorse/variables_datasource.tf
Normal file
@ -0,0 +1,27 @@
|
||||
# Grafana data sources for this environment. There is no default, so the list
# must be supplied.
variable "datasources" {
  description = "List of Grafana data sources"

  type = list(object({
    # Required attributes
    name        = string # Data source name (displayed in Grafana)
    uid         = string # Unique source identifier
    type        = string # Data source type (e.g., prometheus, mysql, clickhouse)
    access_mode = string # Access mode: proxy or direct
    is_default  = bool   # Set as default source

    # Optional connection attributes
    url      = optional(string, null) # Connection URL (for most sources)
    username = optional(string, null)

    # Optional basic authentication
    basic_auth          = optional(bool, false)  # Use basic authentication
    basic_auth_user     = optional(string, null) # Username for basic authentication
    basic_auth_password = optional(string, null) # Password for basic authentication

    # Optional extra settings
    # NOTE(review): map(any) requires every value in the map to share one type,
    # so mixed bool/number/string values are converted to a single type.
    json_data        = optional(map(any), {})    # Additional parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format

    # Optional Terraform lifecycle management fields
    keep_manual_changes         = optional(bool, false) # Ignore manual changes in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
}
|
||||
|
||||
30
environments/dev/Seahorse/variables_notification_policies.tf
Normal file
30
environments/dev/Seahorse/variables_notification_policies.tf
Normal file
@ -0,0 +1,30 @@
|
||||
# Notification routing rules. Each rule names a contact point and the label
# matchers that select it, and may carry one level of nested rules of the same
# shape. Defaults to an empty list.
variable "notification_policies" {
  description = "Routing rules for specific label sets"
  default     = []

  type = list(object({
    contact_point = string

    matchers = list(object({
      label = string
      match = string # Allowed operators are = for equality, != for negated equality, =~ for regex equality, and !~ for negated regex equality
      value = string
    }))

    continue        = optional(bool)
    group_by        = optional(list(string))
    group_wait      = optional(string)
    group_interval  = optional(string)
    repeat_interval = optional(string)

    # Nested rules; an omitted attribute becomes an empty list.
    policies = optional(list(object({
      contact_point = string

      matchers = list(object({
        label = string
        match = string
        value = string
      }))

      continue        = optional(bool)
      group_by        = optional(list(string))
      group_wait      = optional(string)
      group_interval  = optional(string)
      repeat_interval = optional(string)
    })), [])
  }))
}
|
||||
15
environments/dev/Seahorse/variables_organization.tf
Normal file
15
environments/dev/Seahorse/variables_organization.tf
Normal file
@ -0,0 +1,15 @@
|
||||
# Input variable for organizations at the environment level
|
||||
# Organizations managed at the environment level. Every attribute is required
# and there is no default.
variable "organizations" {
  description = "Grafana organization configuration"

  type = list(object({
    organization_name           = string
    create_new_organization     = bool
    keep_manual_changes         = bool
    prevent_destroy_on_recreate = bool
  }))
}
|
||||
|
||||
# Grafana organization ID, passed as a string. There is no default.
variable "org_id" {
  type        = string
  description = "Grafana organization ID"
}
|
||||
@ -0,0 +1,31 @@
|
||||
# Test alert rule definition (dev contour): low disk space on VCMT nodes.
name: 'DEV ADIBROV - Low Disk Space (10%) - VCMT Nodes TEST ALLERT DEV'

# Evaluation parameters.
for: '1m'
condition_type: 'gt'
threshold: 90
need_reduce: true
reducer_type: 'min'
no_data_state: 'NoData'
exec_err_state: 'Error'

# Used space in percent (100 - available * 100 / size) for zfs/xfs/ext*
# filesystems on instances matching "ydx-.*:9100", excluding mountpoints under
# /sys, /proc, /dev, /run and /boot.
expression: |
  100 - (
    node_filesystem_avail_bytes{
      instance=~"ydx-.*:9100",
      mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
      fstype=~"(zfs|xfs|ext.)"
    }
    * 100
    /
    node_filesystem_size_bytes{
      instance=~"ydx-.*:9100",
      mountpoint!~"^(/sys.*|/proc.*|/dev.*|/run.*|/boot.*)$",
      fstype=~"(zfs|xfs|ext.)"
    }
  )

labels:
  status: 'test'
  severity: 'critical'
  service: 'system'

summary: |
  {{ printf "%.0f" $values.B.Value }}% Usage on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
@ -0,0 +1,20 @@
|
||||
# Test alert rule definition (dev contour): vmagent is dropping data from its
# persistent queue.
name: 'DEV ADIBROV - Vmagent Persistent Queue Is Dropping DataTEST ALLERT DEV'

# Evaluation parameters.
for: '10m'
condition_type: 'gt'
threshold: 0
need_reduce: true
reducer_type: 'sum'
no_data_state: 'OK'
exec_err_state: 'KeepLast'

# 5-minute increase of dropped persistent-queue bytes for jobs matching
# ".*agent.*", summed over every label except `path`; only results > 0 are kept.
expression: |
  sum(increase(vm_persistentqueue_bytes_dropped_total{job=~".*agent.*"}[5m])) without (path) > 0

labels:
  status: 'test'
  severity: 'critical'
  service: 'vmagent'

summary: |
  Инстанс {{ $labels.instance }} сбрасывает данные из переполненного буфера.
description: |
  VMAgent-у на инстансе {{ $labels.instance }} пришлось сбросить данные из дискового буфера.

  **Влияние: ПРОИСХОДИТ АКТИВНАЯ ПОТЕРЯ МЕТРИК!** Дисковый буфер переполнен, и vmagent удаляет старые данные, чтобы освободить место для новых.
|
||||
@ -0,0 +1,20 @@
|
||||
# Test alert rule definition (dev contour): vmagent fails to scrape targets.
name: 'DEV ADIBROV - Vmagent Too Many Scrape ErrorsTEST ALLERT DEV'

# Evaluation parameters. No reducer is configured (need_reduce is false).
# NOTE(review): the expression below filters at "> 35" while `threshold` is 40;
# confirm which value is intended.
for: '15m'
condition_type: 'gt'
threshold: 40 # temporary threshold
need_reduce: false
no_data_state: 'OK'
exec_err_state: 'KeepLast'

# 5-minute increase of failed scrapes for jobs matching ".*agent.*";
# only results > 35 are kept.
expression: |
  increase(vm_promscrape_scrapes_failed_total{job=~".*agent.*"}[5m]) > 35

labels:
  status: 'test'
  severity: 'warning'
  service: 'vmagent'

summary: |
  Vmagent не может собрать один или несколько target'ов на инстансе {{ $labels.instance }}.
description: |
  Job "{{ $labels.job }}" на инстансе {{ $labels.instance }} не может успешно скрапить target'ы в течение последних 15 минут.

  **Влияние:** ПРОИСХОДИТ ПРЯМАЯ ПОТЕРЯ МЕТРИК ОТ ЦЕЛЕВОГО СЕРВИСА!
  Вы не получаете данные от одного или нескольких наблюдаемых сервисов. Дашборды и алерты, связанные с этими target'ами, будут показывать неполную или устаревшую информацию в мониторинге.
|
||||
@ -0,0 +1,29 @@
|
||||
# Test alert rule definition (dev contour): disk usage above 90 %
# (less than 10 % free).
name: "DEV ADIBROV - Критически мало места на диске (свободно 10%)TEST ALLERT DEV"

# Used space in percent (100 - available * 100 / size) for zfs/xfs/ext*
# filesystems of the "self-monitoring" job, excluding mountpoints under
# /sys, /proc, /dev, /run and /boot.
expression: |
  (
    100
    - (
      node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
      * 100
      / node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
    )
  )

# Evaluation parameters.
threshold: 90
for: "5m"
condition_type: "gt"
need_reduce: true
reducer_type: "max"
no_data_state: "OK"
exec_err_state: "Error"

labels:
  service: "system"
  severity: "critical"
  status: "test"

summary: |
  Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
# The last line used to start with a stray "1" left over from a removed
# numbered checklist; it is dropped so the notice reads like the sibling rules.
description: |
  На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 10% свободного места.
  Это сигнализирует о критически высоком риске остановки записи, сбоев сервисов и ошибок приложений.

  Что проверить:
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
@ -0,0 +1,29 @@
|
||||
# Test alert rule definition (dev contour): disk usage above 80 %
# (less than 20 % free).
name: 'DEV ADIBROV - Мало места на диске (свободно 20%)TEST ALLERT DEV'

# Evaluation parameters.
for: '5m'
condition_type: 'gt'
threshold: 80
need_reduce: true
reducer_type: 'max'
no_data_state: 'OK'
exec_err_state: 'Error'

# Used space in percent (100 - available * 100 / size) for zfs/xfs/ext*
# filesystems of the "self-monitoring" job, excluding mountpoints under
# /sys, /proc, /dev, /run and /boot.
expression: |
  (
    100
    - (
      node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
      * 100
      / node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
    )
  )

labels:
  status: 'test'
  severity: 'warning'
  service: 'system'

summary: |
  Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
  На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 20% свободного места.
  Это сигнализирует о быстром приближении к исчерпанию места и риске деградации записи.

  Что проверить:
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
@ -0,0 +1,29 @@
|
||||
# Test alert rule definition (dev contour): disk usage above 70 %
# (less than 30 % free).
name: 'DEV ADIBROV - Мало места на диске (свободно 30%)TEST ALLERT DEV'

# Evaluation parameters.
for: '5m'
condition_type: 'gt'
threshold: 70
need_reduce: true
reducer_type: 'max'
no_data_state: 'OK'
exec_err_state: 'Error'

# Used space in percent (100 - available * 100 / size) for zfs/xfs/ext*
# filesystems of the "self-monitoring" job, excluding mountpoints under
# /sys, /proc, /dev, /run and /boot.
expression: |
  (
    100
    - (
      node_filesystem_avail_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
      * 100
      / node_filesystem_size_bytes{job="self-monitoring", mountpoint!~"^(/sys|/proc|/dev|/run|/boot).*", fstype=~"(zfs|xfs|ext.)"}
    )
  )

labels:
  status: 'test'
  severity: 'warning'
  service: 'system'

summary: |
  Disk usage {{ printf "%.0f" $values.B.Value }}% on {{ $labels.mountpoint }} ({{ $labels.instance }})
description: |
  На {{ $labels.mountpoint }} у {{ $labels.instance }} осталось менее 30% свободного места.
  Это сигнализирует о раннем риске заполнения диска и необходимости плановой очистки.

  Что проверить:
  ТЕСТОВЫЙ АЛЛЕРТ В ДЕВ КОНТУРЕ!!!! НЕ РЕАГИРОВАТЬ!!!!
|
||||
24
environments/dev/adibrov/backend.tf
Normal file
24
environments/dev/adibrov/backend.tf
Normal file
@ -0,0 +1,24 @@
|
||||
# Terraform settings for this environment: remote state and required providers.
terraform {
  # State is kept in an S3-compatible bucket behind storage.yandexcloud.net.
  backend "s3" {
    bucket = "monitoring-vcmt-core-deploy"
    key    = "a.dibrov-practic/terraform.tfstate"
    region = "ru-central1"

    endpoints = {
      s3 = "https://storage.yandexcloud.net"
    }

    # Validation and metadata lookups of the S3 backend are switched off;
    # presumably because the endpoint is not AWS itself.
    skip_credentials_validation = true
    skip_metadata_api_check     = true
    skip_region_validation      = true
    skip_requesting_account_id  = true
    skip_s3_checksum            = true
  }

  required_providers {
    # NOTE(review): no version constraint is set for the vault provider.
    vault = {
      source = "hashicorp/vault"
    }
    grafana = {
      source  = "grafana/grafana"
      version = ">= 4.7.0"
    }
  }
}
|
||||
3675
environments/dev/adibrov/dashboards/self-monitoring/angie.json
Normal file
3675
environments/dev/adibrov/dashboards/self-monitoring/angie.json
Normal file
File diff suppressed because it is too large
Load Diff
23876
environments/dev/adibrov/dashboards/system/node_exporter_full.json
Normal file
23876
environments/dev/adibrov/dashboards/system/node_exporter_full.json
Normal file
File diff suppressed because it is too large
Load Diff
50
environments/dev/adibrov/locals.tf
Normal file
50
environments/dev/adibrov/locals.tf
Normal file
@ -0,0 +1,50 @@
|
||||
locals {
  # Extra HTTP headers for the Grafana provider, derived from
  # var.disable_provenance.
  #
  # The header is emitted only when provenance must be disabled. The previous
  # form always sent it (with the value "true" or "false"); a server that checks
  # only for the header's presence would treat "false" as a request to disable
  # provenance as well. Omitting the header is unambiguous in both cases.
  grafana_headers = {
    for header, value in { "X-Disable-Provenance" = "true" } :
    header => value if var.disable_provenance
  }

  # Contact points configuration. Webhook URLs are read from the Vault secret
  # data.vault_kv_secret_v2.secret_ext; every contact point uses the same
  # message template file.
  # NOTE(review): the template path is relative; confirm it resolves from the
  # directory Terraform is run in.
  contact_points = [
    {
      name       = "default"
      type       = "slack"
      is_default = true
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_default"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-critical"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_critical"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-informational"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_info"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    },

    {
      name       = "infra-alerts-test"
      type       = "slack"
      is_default = false
      settings = {
        webhook_url = data.vault_kv_secret_v2.secret_ext.data["mmwebhook_infra_alerts_test"]
        template    = "../../../modules/grafana_contact_points/template/alerts_message_mm.template"
      }
    }
  ]
}
|
||||
21
environments/dev/adibrov/providers.tf
Normal file
21
environments/dev/adibrov/providers.tf
Normal file
@ -0,0 +1,21 @@
|
||||
# Vault provider. No credentials are configured here, so they have to come
# from the environment the run is started in.
provider "vault" {
  skip_child_token = true
  address          = "https://vault.pyn.ru"
}
|
||||
|
||||
# Secrets for connecting to external systems (mm, clickhouse, etc.).
data "vault_kv_secret_v2" "secret_ext" {
  name  = "groups/infraservice/monitoring/grafana/dev/ext"
  mount = "app"
}
|
||||
|
||||
# Secrets used by Grafana itself.
data "vault_kv_secret_v2" "secret_int" {
  name  = "groups/infraservice/monitoring/grafana/dev/int"
  mount = "app"
}
|
||||
|
||||
# Grafana provider (alias "grafana01") for the dev Grafana instance.
# Credentials come from the internal Vault secret; extra request headers come
# from local.grafana_headers.
provider "grafana" {
  alias        = "grafana01"
  http_headers = local.grafana_headers
  url          = "https://grafana-dev.hhmon.ru/"
  # NOTE(review): the key is named "..._password"; confirm the stored value has
  # the format the provider's `auth` argument expects.
  auth = data.vault_kv_secret_v2.secret_int.data["grafana_local_admin_password"]
}
|
||||
54
environments/dev/adibrov/variables_alert.tf
Normal file
54
environments/dev/adibrov/variables_alert.tf
Normal file
@ -0,0 +1,54 @@
|
||||
# Alert duration and timing configuration
|
||||
# Alert evaluation interval, in seconds.
# NOTE(review): default_evaluation_interval in this file has the same default
# and a very similar description; confirm both are needed.
variable "interval_seconds" {
  type        = number
  default     = 60
  description = "Interval in seconds for evaluating alerts"
}
|
||||
|
||||
# Default interval for alert expressions, in milliseconds (60000 ms = 60 s).
variable "default_interval_ms" {
  type        = number
  default     = 60000
  description = "Default interval in milliseconds for evaluating alert expressions"
}
|
||||
|
||||
# Default upper bound on the number of data points.
variable "default_max_data_points" {
  type        = number
  default     = 43200
  description = "Default maximum number of data points"
}
|
||||
|
||||
# Default state for alerts whose query returns no data.
variable "default_no_data_state" {
  type        = string
  default     = "OK"
  description = "Default no data state for alerts"
}
|
||||
|
||||
# Default state for alerts whose evaluation fails.
variable "default_exec_err_state" {
  type        = string
  default     = "Error"
  description = "Default execution error state for alerts"
}
|
||||
|
||||
# Default time a condition must hold before alerting, in seconds.
variable "default_alert_duration" {
  type        = number
  default     = 300 # 5 minutes
  description = "Default duration (in seconds) for how long a condition must be true before alerting"
}
|
||||
|
||||
# Default pause between two evaluations of an alert rule, in seconds.
variable "default_evaluation_interval" {
  type        = number
  default     = 60 # 1 minute
  description = "Default interval (in seconds) between alert rule evaluations"
}
|
||||
|
||||
# Default lookback of the main query, in seconds.
variable "default_time_range_from" {
  type        = number
  default     = 604800 # 7 days
  description = "Default time range (in seconds) for main query lookback"
}
|
||||
|
||||
# Default time range of the processing blocks, in seconds.
variable "default_processing_range" {
  type        = number
  default     = 600 # 10 minutes
  description = "Default time range (in seconds) for processing blocks"
}
|
||||
22
environments/dev/adibrov/variables_auth.tf
Normal file
22
environments/dev/adibrov/variables_auth.tf
Normal file
@ -0,0 +1,22 @@
|
||||
#variable "grafana_url" {
|
||||
# description = "Grafana URL"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
#variable "grafana_auth" {
|
||||
# description = "Grafana authentication token"
|
||||
# type = string
|
||||
#}
|
||||
|
||||
# Switch behind the "X-Disable-Provenance" HTTP header built in locals.tf.
# The description previously said "provisioning", but the flag is only used
# for the provenance header.
variable "disable_provenance" {
  description = "Controls whether Grafana provenance is disabled (X-Disable-Provenance header)"
  type        = bool
  default     = true
}
|
||||
|
||||
# Free-form description of the Grafana environment. It has no default, so a
# value must be supplied.
variable "env" {
  type        = string
  description = "Grafana environment description"
}
|
||||
|
||||
|
||||
22
environments/dev/adibrov/variables_contact_points.tf
Normal file
22
environments/dev/adibrov/variables_contact_points.tf
Normal file
@ -0,0 +1,22 @@
|
||||
# Contact points supplied as an input variable; defaults to an empty list.
# NOTE(review): locals.tf in this directory also defines `contact_points` as a
# local value; confirm which of the two is passed to the module.
variable "contact_points" {
  description = "List of contact points"
  default     = []

  type = list(object({
    name       = string
    type       = string
    settings   = map(string)
    is_default = optional(bool, false)
    labels     = optional(map(string))
  }))
}
|
||||
|
||||
|
||||
#output "contact_point_ids01" {
|
||||
# value = module.grafana_contact_points01.contact_point_ids
|
||||
#}
|
||||
|
||||
#output "contact_point_ids02" {
|
||||
# value = module.grafana_contact_points02.contact_point_ids
|
||||
#}
|
||||
|
||||
|
||||
27
environments/dev/adibrov/variables_datasource.tf
Normal file
27
environments/dev/adibrov/variables_datasource.tf
Normal file
@ -0,0 +1,27 @@
|
||||
# Grafana data sources for this environment. There is no default, so the list
# must be supplied.
variable "datasources" {
  description = "List of Grafana data sources"

  type = list(object({
    # Required attributes
    name        = string # Data source name (displayed in Grafana)
    uid         = string # Unique source identifier
    type        = string # Data source type (e.g., prometheus, mysql, clickhouse)
    access_mode = string # Access mode: proxy or direct
    is_default  = bool   # Set as default source

    # Optional connection attributes
    url      = optional(string, null) # Connection URL (for most sources)
    username = optional(string, null)

    # Optional basic authentication
    basic_auth          = optional(bool, false)  # Use basic authentication
    basic_auth_user     = optional(string, null) # Username for basic authentication
    basic_auth_password = optional(string, null) # Password for basic authentication

    # Optional extra settings
    # NOTE(review): map(any) requires every value in the map to share one type,
    # so mixed bool/number/string values are converted to a single type.
    json_data        = optional(map(any), {})    # Additional parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format

    # Optional Terraform lifecycle management fields
    keep_manual_changes         = optional(bool, false) # Ignore manual changes in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
}
|
||||
|
||||
30
environments/dev/adibrov/variables_notification_policies.tf
Normal file
30
environments/dev/adibrov/variables_notification_policies.tf
Normal file
@ -0,0 +1,30 @@
|
||||
# Notification routing rules. Each top-level rule may carry one level of nested
# child rules in "policies"; both levels share the same set of fields.
variable "notification_policies" {
  description = "Routing rules for specific label sets"

  type = list(object({
    contact_point   = string
    continue        = optional(bool)
    group_by        = optional(list(string))
    group_wait      = optional(string)
    group_interval  = optional(string)
    repeat_interval = optional(string)

    # Label matchers for this rule. "match" is the operator:
    # = (equality), != (negated equality), =~ (regex), !~ (negated regex).
    matchers = list(object({
      label = string
      match = string
      value = string
    }))

    # Nested child rules; an empty list when omitted.
    policies = optional(list(object({
      contact_point   = string
      continue        = optional(bool)
      group_by        = optional(list(string))
      group_wait      = optional(string)
      group_interval  = optional(string)
      repeat_interval = optional(string)
      matchers = list(object({
        label = string
        match = string
        value = string
      }))
    })), [])
  }))

  default = []

  # Reject unknown matcher operators at plan time, on both nesting levels,
  # instead of relying on the comment above to be read.
  validation {
    condition = alltrue(flatten([
      for policy in var.notification_policies : [
        [for m in policy.matchers : contains(["=", "!=", "=~", "!~"], m.match)],
        [for child in policy.policies : [for m in child.matchers : contains(["=", "!=", "=~", "!~"], m.match)]],
      ]
    ]))
    error_message = "Every matcher \"match\" value must be one of: =, !=, =~, !~."
  }
}
|
||||
15
environments/dev/adibrov/variables_organization.tf
Normal file
15
environments/dev/adibrov/variables_organization.tf
Normal file
@ -0,0 +1,15 @@
|
||||
# Environment-level input: the Grafana organizations to configure.
# All four fields are required for every entry.
variable "organizations" {
  description = "Grafana organization configuration"

  type = list(object({
    organization_name           = string
    create_new_organization     = bool
    keep_manual_changes         = bool
    prevent_destroy_on_recreate = bool
  }))
}
|
||||
|
||||
# Identifier of the Grafana organization, passed as a string.
variable "org_id" {
  type        = string
  description = "Grafana organization ID"
}
|
||||
33
environments/modules/grafana_contact_points/locals.tf
Normal file
33
environments/modules/grafana_contact_points/locals.tf
Normal file
@ -0,0 +1,33 @@
|
||||
locals {
  # Fallback message body, used whenever a contact point does not provide a
  # readable template file of its own. The template bundled with the module is
  # loaded when it exists and its __ENV__, __ORG_ID__ and __GRAFANA_URL__
  # placeholders are substituted. These are the same three substitutions applied
  # to per-contact-point templates below, so no raw placeholder is left in
  # messages rendered from the fallback. If the file is absent or unreadable,
  # a fixed placeholder text is used.
  default_telegram_message_template = try(
    fileexists("${path.module}/template/alerts_message_tg.template") ?
    replace(replace(replace(file("${path.module}/template/alerts_message_tg.template"), "__ENV__", var.env), "__ORG_ID__", var.org_id), "__GRAFANA_URL__", var.grafana_url) :
    "Default message template",
    "Default message template"
  )

  # Rendered templates per contact point, keyed by contact point name.
  # NOTE(review): the key is cp.name only, whereas the grafana_contact_point
  # resource keys its instances by "<name>_<type>"; two contact points sharing a
  # name would produce a duplicate key here. Confirm names are unique.
  contact_point_templates = {
    for cp in var.contact_points :
    cp.name => {
      name     = cp.name
      type     = cp.type
      settings = cp.settings

      # Message body: the file named by settings["template"] with placeholders
      # substituted, when that key is set and the file exists. In every other
      # case, including a missing key (which raises an error that try() absorbs),
      # the default template above is used.
      template = try(
        (cp.settings["template"] != null && fileexists(cp.settings["template"])) ?
        replace(replace(replace(file(cp.settings["template"]), "__ENV__", var.env), "__ORG_ID__", var.org_id), "__GRAFANA_URL__", var.grafana_url) :
        local.default_telegram_message_template,
        local.default_telegram_message_template
      )

      # Title: same loading rules applied to settings["title_template"];
      # null when no usable title template is configured.
      title = try(
        (cp.settings["title_template"] != null && fileexists(cp.settings["title_template"])) ?
        replace(replace(replace(file(cp.settings["title_template"]), "__ENV__", var.env), "__ORG_ID__", var.org_id), "__GRAFANA_URL__", var.grafana_url) :
        null,
        null
      )
    }
  }
}
|
||||
26
environments/modules/grafana_contact_points/main.tf
Normal file
26
environments/modules/grafana_contact_points/main.tf
Normal file
@ -0,0 +1,26 @@
|
||||
# One Grafana contact point per entry of var.contact_points. Instances are keyed
# by "<name>_<type>"; exactly one of the integration blocks below is rendered,
# selected by the entry's type.
resource "grafana_contact_point" "contact_points" {
  for_each = { for cp in var.contact_points : "${cp.name}_${cp.type}" => cp }

  org_id             = var.org_id
  name               = each.value.name
  disable_provenance = var.disable_provenance

  # Telegram integration: rendered only for type "telegram".
  dynamic "telegram" {
    for_each = each.value.type == "telegram" ? [each.value] : []
    content {
      chat_id = telegram.value.settings["chat_id"]
      token   = telegram.value.settings["bot_token"]
      message = local.contact_point_templates[telegram.value.name].template
      # Defaults to "HTML" as before; a contact point may set settings["parse_mode"]
      # so that templates written in Markdown syntax can be delivered as such.
      parse_mode = lookup(telegram.value.settings, "parse_mode", "HTML")
    }
  }

  # Slack integration: rendered only for type "slack".
  dynamic "slack" {
    for_each = each.value.type == "slack" ? [each.value] : []
    content {
      url = slack.value.settings["webhook_url"]
      # Title is null when the contact point has no usable title template.
      title = try(local.contact_point_templates[slack.value.name].title, null)
      text  = local.contact_point_templates[slack.value.name].template
    }
  }
}
|
||||
4
environments/modules/grafana_contact_points/outputs.tf
Normal file
4
environments/modules/grafana_contact_points/outputs.tf
Normal file
@ -0,0 +1,4 @@
|
||||
# IDs of the created contact points. Keys are the for_each keys of
# grafana_contact_point.contact_points, i.e. "<name>_<type>", not bare names.
output "contact_point_ids" {
  description = "Mapping of contact point keys (<name>_<type>) to their IDs"
  value       = { for k, v in grafana_contact_point.contact_points : k => v.id }
}
|
||||
@ -0,0 +1,144 @@
|
||||
{{- define "print_links" -}}
  {{- /* Renders the links for one alert on a single line: graph, alert rule, silence, dashboard. */ -}}
  {{- /* Graph link: use PanelURL when set, otherwise build a URL from the panel_id label. */ -}}
  {{- if .PanelURL -}}
    [📊 Graph]({{ .PanelURL }})
  {{- else if and .Labels (index .Labels "panel_id") -}}
    {{- /* Dashboard UID: the dashboard_uid label, else "<service>-dashboard" when a service label exists. */ -}}
    {{- $uid := index .Labels "dashboard_uid" -}}
    {{- if and (not $uid) (index .Labels "service") -}}
      {{- $uid = printf "%s-dashboard" (index .Labels "service") -}}
    {{- end -}}
    {{- if $uid -}}
      [📊 Graph]({{ printf "__GRAFANA_URL__/d/%s?orgId=__ORG_ID__&viewPanel=%s&refresh=2s" $uid (index .Labels "panel_id") }})
    {{- end -}}
  {{- end -}}
  {{- /* Alert-rule and silence links are written only while the alert is firing. */ -}}
  {{- if eq .Status "firing" -}}
    {{- if .GeneratorURL -}}
      {{- " " -}}[⚡ Check Alert]({{ .GeneratorURL }})
    {{- end -}}
    {{- if .SilenceURL -}}
      {{- " " -}}[🤐 Mute]({{ .SilenceURL }})
    {{- end -}}
  {{- end -}}
  {{- if .DashboardURL -}}
    {{- " " -}}[📈 Dashboard]({{ .DashboardURL }})
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alert" -}}
  {{- /* Renders one alert: status line, alert name, summary, optional description, environment, extra labels and, while firing, links. */ -}}
  {{- $severity := or (index .Labels "severity") "" -}}
  {{- $service := or (index .Labels "service") "" -}}

  {{- /* Status marker: green once resolved, otherwise chosen by severity (orange for anything else). */ -}}
  {{- $status := "🟠" -}}
  {{- if eq .Status "resolved" -}}
    {{- $status = "🟢" -}}
  {{- else if eq $severity "disaster" -}}
    {{- $status = "🔴" -}}
  {{- else if eq $severity "critical" -}}
    {{- $status = "🟣" -}}
  {{- else if eq $severity "performance" -}}
    {{- $status = "🟡" -}}
  {{- end -}}

  {{- /* Icon written in front of the summary, also chosen by severity. */ -}}
  {{- $icon := "⚠️" -}}
  {{- if eq $severity "disaster" "critical" -}}
    {{- $icon = "🚨" -}}
  {{- else if eq $severity "performance" -}}
    {{- $icon = "🐌" -}}
  {{- end -}}

  {{- /* Alerts named DatasourceNoData / DatasourceError: show that name as the summary and the rulename label as the alert name. */ -}}
  {{- $summary := index .Annotations "summary" -}}
  {{- $alertname := index .Labels "alertname" -}}
  {{- if eq $alertname "DatasourceNoData" "DatasourceError" -}}
    {{- $summary = $alertname -}}
    {{- $alertname = index .Labels "rulename" -}}
  {{- end -}}

  {{ $status }} **{{ index .Labels "instance" }}**{{ "\n" }}
  {{- if $service -}}
    {{ $service }}{{ ": " }}
  {{- end -}}
  {{ $alertname }}{{ "\n" }}{{ $icon }} {{ $summary }}{{ "\n" }}
  {{- /* Description, when present, is set off by a blank line. */ -}}
  {{- with index .Annotations "description" -}}
    {{ printf "\n%s\n" . }}
  {{- end -}}

  {{- /* Labels shown for every alert; __ENV__ is a placeholder substituted before the template reaches Grafana. */ -}}
  {{- range $key, $val := coll.Dict "environment" "__ENV__" -}}
    `{{ $key }}: {{ $val }}`{{ "\n" }}
  {{- end -}}

  {{- /* Remaining alert labels, one per line, skipping the names listed here. */ -}}
  {{- range $label, $value := .Labels -}}
    {{- if not (eq $label "alertname" "datasource_uid" "grafana_folder" "job"
                "details" "ref_id" "rulename" "instance"
                "service" "severity" "dashboard_uid" "panel_id"
                "ip_version" "ip" "hostname" "role" "team"
                "id" "endpoint" "hostgroup" "module"
                "servername" "type" "vm_project_id" "vm_account_id") -}}
      `{{ $label }}: {{ $value }}`{{ "\n" }}
    {{- end -}}
  {{- end -}}

  {{- /* Links follow a blank line and are written only for firing alerts. */ -}}
  {{- if eq .Status "firing" -}}
    {{ "\n" }}{{ template "print_links" . }}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alerts" -}}
  {{- /* Renders a list of alerts, writing a "---" line before every alert except the first. */ -}}
  {{- /* NOTE(review): a firing alert that renders links ends without a newline, so the separator is appended to the links line; confirm that is intended. */ -}}
  {{- range $position, $item := . -}}
    {{- if gt $position 0 -}}
      {{ "---\n" }}
    {{- end -}}
    {{- template "print_alert" $item -}}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- /* Message body: firing alerts first, then resolved ones, each group under its own heading. */ -}}
{{- with .Alerts -}}
  {{- if .Firing -}}
    **🔥 Firing Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Firing -}}
  {{- end -}}
  {{- /* Two newlines between the groups when both are present. */ -}}
  {{- if and .Firing .Resolved -}}
    {{ "\n\n" }}
  {{- end -}}
  {{- if .Resolved -}}
    **✅ Resolved Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Resolved -}}
  {{- end -}}
{{- end -}}
|
||||
@ -0,0 +1,127 @@
|
||||
{{- define "print_links" -}}
  {{- /* Renders the links for one alert on a single line: graph, alert rule, silence, dashboard. */ -}}
  {{- /* Graph link: use PanelURL when set, otherwise build a URL from the panel_id label. */ -}}
  {{- if .PanelURL -}}
    <{{ .PanelURL }}|📊 Graph>
  {{- else if and .Labels (index .Labels "panel_id") -}}
    {{- /* Dashboard UID: the dashboard_uid label, else "<service>-dashboard" when a service label exists. */ -}}
    {{- $uid := index .Labels "dashboard_uid" -}}
    {{- if and (not $uid) (index .Labels "service") -}}
      {{- $uid = printf "%s-dashboard" (index .Labels "service") -}}
    {{- end -}}
    {{- if $uid -}}
      <{{ printf "__GRAFANA_URL__/d/%s?orgId=__ORG_ID__&viewPanel=%s&refresh=2s" $uid (index .Labels "panel_id") }}|📊 Graph>
    {{- end -}}
  {{- end -}}
  {{- /* Alert-rule and silence links are written only while the alert is firing. */ -}}
  {{- if eq .Status "firing" -}}
    {{- if .GeneratorURL -}}
      {{- " " -}}<{{ .GeneratorURL }}|⚡ Check Alert>
    {{- end -}}
    {{- if .SilenceURL -}}
      {{- " " -}}<{{ .SilenceURL }}|🤐 Mute>
    {{- end -}}
  {{- end -}}
  {{- if .DashboardURL -}}
    {{- " " -}}<{{ .DashboardURL }}|dashboard>
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alert" -}}
  {{- /* Renders one alert: status line, alert name, summary, optional description, extra labels and, while firing, links. */ -}}
  {{- $severity := or (index .Labels "severity") "" -}}
  {{- $service := or (index .Labels "service") "" -}}

  {{- /* Status marker: green once resolved, otherwise chosen by severity (orange for anything else). */ -}}
  {{- $status := "🟠" -}}
  {{- if eq .Status "resolved" -}}
    {{- $status = "🟢" -}}
  {{- else if eq $severity "disaster" "critical" -}}
    {{- $status = "🔴" -}}
  {{- else if eq $severity "performance" -}}
    {{- $status = "🟡" -}}
  {{- end -}}

  {{- /* Icon written in front of the summary, also chosen by severity. */ -}}
  {{- $icon := "⚠️" -}}
  {{- if eq $severity "disaster" "critical" -}}
    {{- $icon = "🚨" -}}
  {{- else if eq $severity "performance" -}}
    {{- $icon = "🐌" -}}
  {{- end -}}

  {{- /* Alerts named DatasourceNoData / DatasourceError: show that name as the summary and the rulename label as the alert name. */ -}}
  {{- $summary := index .Annotations "summary" -}}
  {{- $alertname := index .Labels "alertname" -}}
  {{- if eq $alertname "DatasourceNoData" "DatasourceError" -}}
    {{- $summary = $alertname -}}
    {{- $alertname = index .Labels "rulename" -}}
  {{- end -}}

  {{ $status }} **{{ index .Labels "instance" }}** {{ "\n" }}
  {{- if $service -}}
    {{ $service }}{{ ": " }}
  {{- end -}}
  {{ $alertname }}{{ "\n\n" }}{{ $icon }} {{ $summary }}{{ "\n" }}
  {{- /* Description, when present, is set off by a blank line. */ -}}
  {{- with index .Annotations "description" -}}
    {{ printf "\n%s\n" . }}
  {{- end -}}

  {{- /* Remaining alert labels, one per line, skipping the names listed here; remember whether any was printed. */ -}}
  {{- $has_labels := false -}}
  {{- range $label, $value := .Labels -}}
    {{- if not (eq $label "alertname" "datasource_uid" "grafana_folder" "job" "details" "ref_id" "rulename" "instance" "service" "severity" "dashboard_uid" "panel_id" "ip_version" "ip" "hostname" "role" "team" "id" "endpoint" "hostgroup" "module" "servername" "type" "vm_project_id" "vm_account_id" "environment" "rack" "server_type") -}}
      {{- $has_labels = true -}}
      {{ $label }}: {{ $value }}{{ "\n" }}
    {{- end -}}
  {{- end -}}

  {{- /* Links are written only for firing alerts; a blank line precedes them when labels were printed. */ -}}
  {{- if eq .Status "firing" -}}
    {{- if $has_labels -}}
      {{ "\n" }}
    {{- end -}}
    {{- template "print_links" . -}}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alerts" -}}
  {{- /* Renders a list of alerts, writing a "---" line before every alert except the first. */ -}}
  {{- /* NOTE(review): a firing alert that renders links ends without a newline, so the separator is appended to the links line; confirm that is intended. */ -}}
  {{- range $position, $item := . -}}
    {{- if gt $position 0 -}}
      {{ "---\n" }}
    {{- end -}}
    {{- template "print_alert" $item -}}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- /* Message body: firing alerts first, then resolved ones, each group under its own heading. */ -}}
{{- with .Alerts -}}
  {{- if .Firing -}}
    **🔥 Firing Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Firing -}}
  {{- end -}}
  {{- /* Two newlines between the groups when both are present. */ -}}
  {{- if and .Firing .Resolved -}}
    {{ "\n\n" }}
  {{- end -}}
  {{- if .Resolved -}}
    **✅ Resolved Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Resolved -}}
  {{- end -}}
{{- end -}}
|
||||
@ -0,0 +1,127 @@
|
||||
{{- define "print_links" -}}
  {{- /* Renders the links for one alert on a single line: graph, alert rule, silence, dashboard. */ -}}
  {{- /* Graph link: use PanelURL when set, otherwise build a URL from the panel_id label. */ -}}
  {{- if .PanelURL -}}
    <{{ .PanelURL }}|📊 Graph>
  {{- else if and .Labels (index .Labels "panel_id") -}}
    {{- /* Dashboard UID: the dashboard_uid label, else "<service>-dashboard" when a service label exists. */ -}}
    {{- $uid := index .Labels "dashboard_uid" -}}
    {{- if and (not $uid) (index .Labels "service") -}}
      {{- $uid = printf "%s-dashboard" (index .Labels "service") -}}
    {{- end -}}
    {{- if $uid -}}
      <{{ printf "__GRAFANA_URL__/d/%s?orgId=__ORG_ID__&viewPanel=%s&refresh=2s" $uid (index .Labels "panel_id") }}|📊 Graph>
    {{- end -}}
  {{- end -}}
  {{- /* Alert-rule and silence links are written only while the alert is firing. */ -}}
  {{- if eq .Status "firing" -}}
    {{- if .GeneratorURL -}}
      {{- " " -}}<{{ .GeneratorURL }}|⚡ Check Alert>
    {{- end -}}
    {{- if .SilenceURL -}}
      {{- " " -}}<{{ .SilenceURL }}|🤐 Mute>
    {{- end -}}
  {{- end -}}
  {{- if .DashboardURL -}}
    {{- " " -}}<{{ .DashboardURL }}|dashboard>
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alert" -}}
  {{- /* Renders one alert: status line, alert name, summary, optional description, extra labels and, while firing, links. */ -}}
  {{- $severity := or (index .Labels "severity") "" -}}
  {{- $service := or (index .Labels "service") "" -}}

  {{- /* Status marker: green once resolved, otherwise chosen by severity (orange for anything else). */ -}}
  {{- $status := "🟠" -}}
  {{- if eq .Status "resolved" -}}
    {{- $status = "🟢" -}}
  {{- else if eq $severity "disaster" "critical" -}}
    {{- $status = "🔴" -}}
  {{- else if eq $severity "performance" -}}
    {{- $status = "🟡" -}}
  {{- end -}}

  {{- /* Icon written in front of the summary, also chosen by severity. */ -}}
  {{- $icon := "⚠️" -}}
  {{- if eq $severity "disaster" "critical" -}}
    {{- $icon = "🚨" -}}
  {{- else if eq $severity "performance" -}}
    {{- $icon = "🐌" -}}
  {{- end -}}

  {{- /* Alerts named DatasourceNoData / DatasourceError: show that name as the summary and the rulename label as the alert name. */ -}}
  {{- $summary := index .Annotations "summary" -}}
  {{- $alertname := index .Labels "alertname" -}}
  {{- if eq $alertname "DatasourceNoData" "DatasourceError" -}}
    {{- $summary = $alertname -}}
    {{- $alertname = index .Labels "rulename" -}}
  {{- end -}}

  {{ $status }} **{{ index .Labels "instance" }}** {{ "\n" }}
  {{- if $service -}}
    {{ $service }}{{ ": " }}
  {{- end -}}
  {{ $alertname }}{{ "\n\n" }}{{ $icon }} {{ $summary }}{{ "\n" }}
  {{- /* Description, when present, is set off by a blank line. */ -}}
  {{- with index .Annotations "description" -}}
    {{ printf "\n%s\n" . }}
  {{- end -}}

  {{- /* Remaining alert labels, one per line, skipping the names listed here; remember whether any was printed. */ -}}
  {{- $has_labels := false -}}
  {{- range $label, $value := .Labels -}}
    {{- if not (eq $label "alertname" "datasource_uid" "grafana_folder" "job" "details" "ref_id" "rulename" "instance" "service" "severity" "dashboard_uid" "panel_id" "ip_version" "ip" "hostname" "role" "team" "id" "endpoint" "hostgroup" "module" "servername" "type" "vm_project_id" "vm_account_id" "environment" "rack" "server_type") -}}
      {{- $has_labels = true -}}
      {{ $label }}: {{ $value }}{{ "\n" }}
    {{- end -}}
  {{- end -}}

  {{- /* Links are written only for firing alerts; a blank line precedes them when labels were printed. */ -}}
  {{- if eq .Status "firing" -}}
    {{- if $has_labels -}}
      {{ "\n" }}
    {{- end -}}
    {{- template "print_links" . -}}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alerts" -}}
  {{- /* Renders a list of alerts, writing a "---" line before every alert except the first. */ -}}
  {{- /* NOTE(review): a firing alert that renders links ends without a newline, so the separator is appended to the links line; confirm that is intended. */ -}}
  {{- range $position, $item := . -}}
    {{- if gt $position 0 -}}
      {{ "---\n" }}
    {{- end -}}
    {{- template "print_alert" $item -}}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- /* Message body: firing alerts first, then resolved ones, each group under its own heading. */ -}}
{{- with .Alerts -}}
  {{- if .Firing -}}
    **🔥 Firing Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Firing -}}
  {{- end -}}
  {{- /* Two newlines between the groups when both are present. */ -}}
  {{- if and .Firing .Resolved -}}
    {{ "\n\n" }}
  {{- end -}}
  {{- if .Resolved -}}
    **✅ Resolved Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Resolved -}}
  {{- end -}}
{{- end -}}
|
||||
@ -0,0 +1,138 @@
|
||||
{{- define "print_links" -}}
  {{- /* Renders the links for one alert on a single line: graph, alert rule, silence, dashboard. */ -}}
  {{- /* Graph link: use PanelURL when set, otherwise build a URL from the panel_id label. */ -}}
  {{- if .PanelURL -}}
    [📊 Graph]({{ .PanelURL }})
  {{- else if and .Labels (index .Labels "panel_id") -}}
    {{- /* Dashboard UID: the dashboard_uid label, else "<service>-dashboard" when a service label exists. */ -}}
    {{- $uid := index .Labels "dashboard_uid" -}}
    {{- if and (not $uid) (index .Labels "service") -}}
      {{- $uid = printf "%s-dashboard" (index .Labels "service") -}}
    {{- end -}}
    {{- if $uid -}}
      [📊 Graph]({{ printf "__GRAFANA_URL__/d/%s?orgId=__ORG_ID__&viewPanel=%s&refresh=2s" $uid (index .Labels "panel_id") }})
    {{- end -}}
  {{- end -}}
  {{- /* Alert-rule and silence links are written only while the alert is firing. */ -}}
  {{- if eq .Status "firing" -}}
    {{- if .GeneratorURL -}}
      {{- " " -}}[⚡ Check Alert]({{ .GeneratorURL }})
    {{- end -}}
    {{- if .SilenceURL -}}
      {{- " " -}}[🤐 Mute]({{ .SilenceURL }})
    {{- end -}}
  {{- end -}}
  {{- if .DashboardURL -}}
    {{- " " -}}[📈 Dashboard]({{ .DashboardURL }})
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alert" -}}
  {{- /* Renders one alert: status line, alert name, summary, environment, extra labels and, while firing, links. */ -}}
  {{- $severity := or (index .Labels "severity") "" -}}
  {{- $service := or (index .Labels "service") "" -}}

  {{- /* Status marker: green once resolved, otherwise chosen by severity (orange for anything else). */ -}}
  {{- $status := "🟠" -}}
  {{- if eq .Status "resolved" -}}
    {{- $status = "🟢" -}}
  {{- else if eq $severity "disaster" -}}
    {{- $status = "🔴" -}}
  {{- else if eq $severity "critical" -}}
    {{- $status = "🟣" -}}
  {{- else if eq $severity "performance" -}}
    {{- $status = "🟡" -}}
  {{- end -}}

  {{- /* Icon written in front of the summary, also chosen by severity. */ -}}
  {{- $icon := "⚠️" -}}
  {{- if eq $severity "disaster" "critical" -}}
    {{- $icon = "🚨" -}}
  {{- else if eq $severity "performance" -}}
    {{- $icon = "🐌" -}}
  {{- end -}}

  {{- /* Alerts named DatasourceNoData / DatasourceError: show that name as the summary and the rulename label as the alert name. */ -}}
  {{- $summary := index .Annotations "summary" -}}
  {{- $alertname := index .Labels "alertname" -}}
  {{- if eq $alertname "DatasourceNoData" "DatasourceError" -}}
    {{- $summary = $alertname -}}
    {{- $alertname = index .Labels "rulename" -}}
  {{- end -}}

  {{ $status }} **{{ index .Labels "instance" }}**{{ "\n" }}
  {{- if $service -}}
    {{ $service }}{{ ": " }}
  {{- end -}}
  {{ $alertname }}{{ "\n" }}{{ $icon }} {{ $summary }}{{ "\n" }}

  {{- /* Labels shown for every alert; __ENV__ is a placeholder substituted before the template reaches Grafana. */ -}}
  {{- range $key, $val := coll.Dict "environment" "__ENV__" -}}
    `{{ $key }}: {{ $val }}`{{ "\n" }}
  {{- end -}}

  {{- /* Remaining alert labels, one per line, skipping the names listed here. */ -}}
  {{- range $label, $value := .Labels -}}
    {{- if not (eq $label "alertname" "datasource_uid" "grafana_folder" "job"
                "details" "ref_id" "rulename" "instance"
                "service" "severity" "dashboard_uid" "panel_id"
                "ip_version" "ip" "hostname" "role" "team"
                "id" "endpoint" "hostgroup" "module"
                "servername" "type" "vm_project_id" "vm_account_id") -}}
      `{{ $label }}: {{ $value }}`{{ "\n" }}
    {{- end -}}
  {{- end -}}

  {{- /* Links follow a blank line and are written only for firing alerts. */ -}}
  {{- if eq .Status "firing" -}}
    {{ "\n" }}{{ template "print_links" . }}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- define "print_alerts" -}}
  {{- /* Renders a list of alerts, writing a "---" line before every alert except the first. */ -}}
  {{- /* NOTE(review): a firing alert that renders links ends without a newline, so the separator is appended to the links line; confirm that is intended. */ -}}
  {{- range $position, $item := . -}}
    {{- if gt $position 0 -}}
      {{ "---\n" }}
    {{- end -}}
    {{- template "print_alert" $item -}}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- /* Message body: firing alerts first, then resolved ones, each group under its own heading. */ -}}
{{- with .Alerts -}}
  {{- if .Firing -}}
    **🔥 Firing Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Firing -}}
  {{- end -}}
  {{- /* Two newlines between the groups when both are present. */ -}}
  {{- if and .Firing .Resolved -}}
    {{ "\n\n" }}
  {{- end -}}
  {{- if .Resolved -}}
    **✅ Resolved Alerts**{{ "\n\n---\n" }}
    {{- template "print_alerts" .Resolved -}}
  {{- end -}}
{{- end -}}
|
||||
@ -0,0 +1 @@
|
||||
{{/* Title: firing count (or RESOLVED) followed by the alertname and instance labels of the first alert. */}}{{ with .Alerts.Firing }}[FIRING:{{ len . }}]{{ else }}[RESOLVED]{{ end }}{{ $first := index .Alerts 0 }} {{ $first.Labels.alertname }} at {{ $first.Labels.instance }}
|
||||
@ -0,0 +1,60 @@
|
||||
{{ define "custom_alert.title" }}{{/* Title: upper-cased status with firing/resolved counts, then the common labels as comma-separated "name: value" pairs when the notification is grouped. */}}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}: {{ .Alerts.Firing | len }}{{ if gt (.Alerts.Resolved | len) 0 }}, RESOLVED: {{ .Alerts.Resolved | len }}{{ end }}{{ end }}]{{ if gt (len .GroupLabels) 0 }} Grouped by: {{ range $i, $pair := .CommonLabels.SortedPairs }}{{ if $i }}, {{ end }}{{ $pair.Name }}: {{ $pair.Value }}{{ end }}{{ end }}{{ end }}
|
||||
|
||||
{{ define "__text_alert_name" }}{{/* Writes the value of the alertname label. */}}{{ index .Labels "alertname" }}{{ end }}
|
||||
{{ define "__text_alert_summary" }}{{/* Writes the summary annotation followed by a newline, when that annotation is present. */}}{{ range $pair := .Annotations.SortedPairs }}{{ if eq $pair.Name "summary" }}{{ $pair.Value }}{{ "\n" }}{{ end }}{{ end }}{{ end }}
|
||||
{{ define "__text_alert_description" }}{{/* Writes the value of the description annotation. */}}{{ index .Annotations "description" }}{{ end }}
|
||||
{{ define "__text_alert_runbook_url" }}{{/* Writes a "Playbook" link on a new line when a runbook_url annotation is present. */}}{{ range $pair := .Annotations.SortedPairs }}{{ if eq $pair.Name "runbook_url" }}{{ "\n" }}:bookmark_tabs: <{{ $pair.Value }}|Playbook>{{ end }}{{ end }}{{ end }}
|
||||
|
||||
{{ define "__text_alert_firing_item" }}{{/* One firing alert: bell marker with the alert name (as the resolved item does), its labels, then action links. The labels alertname, ref_id, datasource_uid and rule_uid are not listed; "cluster" is shown as request_from_cluster for the "Kube-apiserver or blackbox is down" alert. */}}:bell: {{ template "__text_alert_name" . }}
{{ $alertName := (index .Labels "alertname") }}
Labels: {{ range .Labels.SortedPairs }}
{{- if and (ne .Name "alertname") (ne .Name "ref_id") (ne .Name "datasource_uid") (ne .Name "rule_uid") }}
- {{ if and (eq .Name "cluster") (eq $alertName "Kube-apiserver or blackbox is down") }}request_from_cluster{{ else }}{{ .Name }}{{ end }} = {{ .Value }}
{{- end }}
{{- end }}

Actions:
{{ if .DashboardURL }}➡️ <{{ .DashboardURL }}|Go to dashboard>{{ end }}
{{ if .PanelURL }}:chart_with_upwards_trend: <{{ .PanelURL }}|Go to panel>{{ end }}
{{ if .GeneratorURL }}:arrow_right: <{{ .GeneratorURL }}|Go to alert>{{ end }}
{{ if .SilenceURL }}:mute: <{{ .SilenceURL }}|Silence alert>{{ end }}{{ template "__text_alert_runbook_url" . }}{{ end }}
|
||||
|
||||
{{ define "__text_alert_resolved_item" }}{{/* One resolved alert: green marker with the alert name, its labels, then action links. The labels alertname, ref_id, datasource_uid and rule_uid are not listed; "cluster" is shown as request_from_cluster for the "Kube-apiserver or blackbox is down" alert. */}}:large_green_circle: {{ template "__text_alert_name" . }}
{{ $alertName := (index .Labels "alertname") }}
Labels: {{ range .Labels.SortedPairs }}
{{- if and (ne .Name "alertname") (ne .Name "ref_id") (ne .Name "datasource_uid") (ne .Name "rule_uid") }}
- {{ if and (eq .Name "cluster") (eq $alertName "Kube-apiserver or blackbox is down") }}request_from_cluster{{ else }}{{ .Name }}{{ end }} = {{ .Value }}
{{- end }}
{{- end }}

Actions:
{{ if .DashboardURL }}➡️ <{{ .DashboardURL }}|Go to dashboard>{{ end }}
{{ if .PanelURL }}:chart_with_upwards_trend: <{{ .PanelURL }}|Go to panel>{{ end }}
{{ if .GeneratorURL }}:arrow_right: <{{ .GeneratorURL }}|Go to alert>{{ end }}{{ end }}
|
||||
|
||||
{{ define "__text_alert_list_firing" }}{{/* Renders every firing alert, each preceded by a blank line. */}}{{ range $alert := . }}{{ "\n\n" }}{{ template "__text_alert_firing_item" $alert }}{{ end }}{{ end }}
|
||||
|
||||
{{ define "__text_alert_list_resolved" }}{{/* Renders every resolved alert, each preceded by a blank line. */}}{{ range $alert := . }}{{ "\n\n" }}{{ template "__text_alert_resolved_item" $alert }}{{ end }}{{ end }}
|
||||
|
||||
{{ define "custom_alert.message" }}{{/* Message body: count and list of firing alerts, then count and list of resolved alerts; an empty group is omitted. */}}
{{ with .Alerts.Firing }}{{ len . }} Firing{{ template "__text_alert_list_firing" . }}{{ end }}

{{ with .Alerts.Resolved }}{{ len . }} Resolved{{ template "__text_alert_list_resolved" . }}{{ end }}{{ end }}
|
||||
@ -0,0 +1,70 @@
|
||||
{{- /*
  telegram_message is the Telegram message body: a bold counter line and the
  per-alert blocks for firing alerts, then the same for resolved alerts.
  Each section is skipped when its alert list is empty.
*/ -}}
{{ define "telegram_message" }}
{{ with .Alerts.Firing }}
<b>🔥 {{ len . }} alert(s) firing:</b>
{{ range $alert := . }} {{ template "telegram_alert_firing" $alert }} {{ end }} {{ end }}
{{ with .Alerts.Resolved }}
<b>✅ {{ len . }} alert(s) resolved:</b>
{{ range $alert := . }} {{ template "telegram_alert_resolved" $alert }} {{ end }} {{ end }}
{{ end }}
|
||||
{{- /*
  telegram_alert_firing renders one firing alert for Telegram: the values of
  query refs A and B (each printed with one decimal unless it formats as
  "0.0"), then a fixed selection of labels, each shown only when non-empty.

  The severity label is read as "severity" first and falls back to the legacy
  capitalised "Severity": label keys are case-sensitive and the alert rules
  define the lower-case key, so looking up only "Severity" never matched them.
*/ -}}
{{ define "telegram_alert_firing" }}
<b>Value = </b>
{{- $value := .Values }}
{{- if ne (printf "%.1f" $value.A) "0.0" }}
{{- printf "%.1f" $value.A }}{{- end }}
{{- if ne (printf "%.1f" $value.B) "0.0" }} {{- printf "%.1f" $value.B }}{{- end }}
<b>Labels:</b>
{{- if index .Labels "name" }}
- Name = {{ index .Labels "name" }}
{{- end }}
{{- if index .Labels "instance" }}
- IP = {{ index .Labels "instance" }}
{{- end }}
{{- with or (index .Labels "severity") (index .Labels "Severity") }}
- Severity = {{ . }}
{{- end }}
{{- if index .Labels "grafana_folder" }}
- Grafana_folder = {{ index .Labels "grafana_folder" }}
{{- end }}
{{- if index .Labels "volume" }}
- Volume = {{ index .Labels "volume" }}
{{- end }}
{{- if index .Labels "mountpoint" }}
- Mountpoint = {{ index .Labels "mountpoint" }}
{{- end }}
{{- if index .Labels "job" }}
- OS = {{ index .Labels "job" }}
{{- end }}
{{- if index .Labels "loc" }}
- Location = {{ index .Labels "loc" }}
{{- end }}
{{ end }}
|
||||
{{- /*
  telegram_alert_resolved renders one resolved alert for Telegram: the alert
  name, the Node and AlertValues annotations, then a fixed selection of
  labels, each shown only when non-empty.

  The severity label is read as "severity" first and falls back to the legacy
  capitalised "Severity": label keys are case-sensitive and the alert rules
  define the lower-case key, so looking up only "Severity" never matched them.
*/ -}}
{{ define "telegram_alert_resolved" }}
<b>{{ .Labels.alertname }}</b>
Node: <b>{{ .Annotations.Node }}</b>
{{ .Annotations.AlertValues }}
<b>Labels:</b>
{{- if index .Labels "name" }}
- Name = {{ index .Labels "name" }}
{{- end }}
{{- if index .Labels "instance" }}
- IP = {{ index .Labels "instance" }}
{{- end }}
{{- with or (index .Labels "severity") (index .Labels "Severity") }}
- Severity = {{ . }}
{{- end }}
{{- if index .Labels "grafana_folder" }}
- Grafana_folder = {{ index .Labels "grafana_folder" }}
{{- end }}
{{- if index .Labels "volume" }}
- Volume = {{ index .Labels "volume" }}
{{- end }}
{{- if index .Labels "mountpoint" }}
- Mountpoint = {{ index .Labels "mountpoint" }}
{{- end }}
{{- if index .Labels "job" }}
- OS = {{ index .Labels "job" }}
{{- end }}
{{- if index .Labels "loc" }}
- Location = {{ index .Labels "loc" }}
{{- end }}
{{ end }}
|
||||
29
environments/modules/grafana_contact_points/variables.tf
Normal file
29
environments/modules/grafana_contact_points/variables.tf
Normal file
@ -0,0 +1,29 @@
|
||||
# Inputs of the grafana_contact_points module.

# Contact points to manage; `settings` is a flat string map whose keys depend
# on the contact point `type`.
variable "contact_points" {
  description = "List of contact points for Grafana alerts"
  type = list(object({
    name     = string
    type     = string
    settings = map(string)
  }))
}

# The flag is about provenance (not provisioning); the previous description
# mixed the two terms up.
# NOTE(review): presumably forwarded to the provider's `disable_provenance`
# argument in this module's main.tf - confirm.
variable "disable_provenance" {
  description = "Disable provenance on the provisioned alerting resources so they stay editable outside Terraform (e.g. in the Grafana UI)"
  type        = bool
  default     = true
}

variable "org_id" {
  description = "ID of the Grafana organization"
  type        = string
}

variable "grafana_url" {
  description = "Grafana URL"
  type        = string
}

variable "env" {
  description = "Grafana environment"
  type        = string
}
|
||||
7
environments/modules/grafana_contact_points/versions.tf
Normal file
7
environments/modules/grafana_contact_points/versions.tf
Normal file
@ -0,0 +1,7 @@
|
||||
# Provider requirements of the grafana_contact_points module.
# Only the provider source is declared; no version constraint is set here.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
9
environments/modules/grafana_dashboard/outputs.tf
Normal file
9
environments/modules/grafana_dashboard/outputs.tf
Normal file
@ -0,0 +1,9 @@
|
||||
# Single map over all four dashboard resource variants, keyed by the
# resources' for_each key. On a duplicate key the later variant wins, in the
# order ignore_and_protect, ignore_only, protect_only, standard.
output "dashboard_ids" {
  description = "IDs of the created Grafana dashboards"
  value = {
    for key, res in merge(
      grafana_dashboard.dashboards_ignore_and_protect,
      grafana_dashboard.dashboards_ignore_only,
      grafana_dashboard.dashboards_protect_only,
      grafana_dashboard.dashboards_standard,
    ) : key => res.id
  }
}
|
||||
7
environments/modules/grafana_dashboard/versions.tf
Normal file
7
environments/modules/grafana_dashboard/versions.tf
Normal file
@ -0,0 +1,7 @@
|
||||
# Provider requirements of the grafana_dashboard module.
# Only the provider source is declared; no version constraint is set here.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
22
environments/modules/grafana_dashboard_folder/locals.tf
Normal file
22
environments/modules/grafana_dashboard_folder/locals.tf
Normal file
@ -0,0 +1,22 @@
|
||||
locals {
  # Partition var.groups into four disjoint maps, keyed by group name, one per
  # combination of the keep_manual_changes / prevent_destroy_on_recreate flags.
  # Both flags are declared with a default of false on var.groups, so they are
  # read directly.

  # Neither flag set.
  folders_standard = {
    for g in var.groups : g.dashboard_alert_group_name => g
    if !(g.keep_manual_changes || g.prevent_destroy_on_recreate)
  }

  # keep_manual_changes only.
  folders_ignore_only = {
    for g in var.groups : g.dashboard_alert_group_name => g
    if g.keep_manual_changes && !g.prevent_destroy_on_recreate
  }

  # prevent_destroy_on_recreate only.
  folders_protect_only = {
    for g in var.groups : g.dashboard_alert_group_name => g
    if g.prevent_destroy_on_recreate && !g.keep_manual_changes
  }

  # Both flags set.
  folders_ignore_and_protect = {
    for g in var.groups : g.dashboard_alert_group_name => g
    if g.keep_manual_changes && g.prevent_destroy_on_recreate
  }
}
|
||||
44
environments/modules/grafana_dashboard_folder/main.tf
Normal file
44
environments/modules/grafana_dashboard_folder/main.tf
Normal file
@ -0,0 +1,44 @@
|
||||
# One grafana_folder resource per lifecycle combination, because lifecycle
# settings cannot be chosen per instance. Each for_each map is keyed by the
# group name, which is also used as the folder title.

# Title changes are ignored and the folder cannot be destroyed.
resource "grafana_folder" "folders_ignore_and_protect" {
  for_each = local.folders_ignore_and_protect

  org_id = var.org_id
  title  = each.key

  lifecycle {
    prevent_destroy = true
    ignore_changes  = [title]
  }
}

# Title changes are ignored.
resource "grafana_folder" "folders_ignore_only" {
  for_each = local.folders_ignore_only

  org_id = var.org_id
  title  = each.key

  lifecycle {
    ignore_changes = [title]
  }
}

# The folder cannot be destroyed.
resource "grafana_folder" "folders_protect_only" {
  for_each = local.folders_protect_only

  org_id = var.org_id
  title  = each.key

  lifecycle {
    prevent_destroy = true
  }
}

# No lifecycle customisation.
resource "grafana_folder" "folders_standard" {
  for_each = local.folders_standard

  org_id = var.org_id
  title  = each.key
}
|
||||
21
environments/modules/grafana_dashboard_folder/outputs.tf
Normal file
21
environments/modules/grafana_dashboard_folder/outputs.tf
Normal file
@ -0,0 +1,21 @@
|
||||
# Both outputs cover all four folder resource variants and are keyed by alert
# group name (the resources' for_each key).

# Alert group name => folder ID.
output "folder_ids" {
  description = "Mapping of alert group names to their folder IDs in Grafana"
  value = {
    for group_name, folder in merge(
      grafana_folder.folders_ignore_and_protect,
      grafana_folder.folders_ignore_only,
      grafana_folder.folders_protect_only,
      grafana_folder.folders_standard,
    ) : group_name => folder.id
  }
}

# Alert group name => folder UID.
output "folder_uids" {
  description = "Mapping of alert group names to their folder UIDs in Grafana"
  value = {
    for group_name, folder in merge(
      grafana_folder.folders_ignore_and_protect,
      grafana_folder.folders_ignore_only,
      grafana_folder.folders_protect_only,
      grafana_folder.folders_standard,
    ) : group_name => folder.uid
  }
}
|
||||
16
environments/modules/grafana_dashboard_folder/variables.tf
Normal file
16
environments/modules/grafana_dashboard_folder/variables.tf
Normal file
@ -0,0 +1,16 @@
|
||||
# Inputs of the grafana_dashboard_folder module.

variable "org_id" {
  type        = string
  description = "ID of the Grafana organization"
}

# Only dashboard_alert_group_name and the two lifecycle flags are read by this
# module's locals; the other attributes are part of the accepted object shape.
variable "groups" {
  type = list(object({
    dashboard_alert_group_name  = string
    alert_definitions_path      = string
    alerts_on_datasources_uid   = list(string)
    dashboard_path_if_exist     = optional(string, null)
    keep_manual_changes         = optional(bool, false)
    prevent_destroy_on_recreate = optional(bool, false)
  }))
  description = "List of alert groups with their definitions"
}
|
||||
@ -0,0 +1,7 @@
|
||||
# Provider requirements of this module.
# Only the provider source is declared; no version constraint is set here.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
49
environments/modules/grafana_dashboard_folder_xt5/locals.tf
Normal file
49
environments/modules/grafana_dashboard_folder_xt5/locals.tf
Normal file
@ -0,0 +1,49 @@
|
||||
locals {
  # Groups indexed by their full dashboard_alert_group_name (used to resolve parents).
  groups_by_name = {
    for g in var.groups : g.dashboard_alert_group_name => g
  }

  # "/"-separated path segments of every group name, computed once.
  folder_path_parts = {
    for g in var.groups : g.dashboard_alert_group_name => split("/", g.dashboard_alert_group_name)
  }

  # Everything before the last segment, or null for a name without "/".
  folder_parent_names = {
    for name, parts in local.folder_path_parts :
    name => length(parts) > 1 ? join("/", slice(parts, 0, length(parts) - 1)) : null
  }

  # Per-group folder description:
  #   folder_title      - last path segment (the whole name when there is no "/")
  #   parent_folder_uid - folder_uid of the group whose name equals the parent
  #                       path, or null when there is no parent or no such group
  folder_structure = {
    for name, g in local.groups_by_name : name => {
      group              = g
      parts              = local.folder_path_parts[name]
      has_parent         = length(local.folder_path_parts[name]) > 1
      parent_folder_name = local.folder_parent_names[name]
      folder_title       = element(local.folder_path_parts[name], length(local.folder_path_parts[name]) - 1)
      parent_folder_uid = (
        local.folder_parent_names[name] == null
        ? null
        : try(local.groups_by_name[local.folder_parent_names[name]].folder_uid, null)
      )
    }
  }

  # Partition by the keep_manual_changes / prevent_destroy_on_recreate flags
  # (both default to false on var.groups).
  folders_standard = {
    for name, info in local.folder_structure : name => info
    if !(info.group.keep_manual_changes || info.group.prevent_destroy_on_recreate)
  }

  folders_ignore_only = {
    for name, info in local.folder_structure : name => info
    if info.group.keep_manual_changes && !info.group.prevent_destroy_on_recreate
  }

  folders_protect_only = {
    for name, info in local.folder_structure : name => info
    if info.group.prevent_destroy_on_recreate && !info.group.keep_manual_changes
  }

  folders_ignore_and_protect = {
    for name, info in local.folder_structure : name => info
    if info.group.keep_manual_changes && info.group.prevent_destroy_on_recreate
  }

  # Union of the four partitions, for parent UID lookup.
  all_folders = merge(
    local.folders_ignore_and_protect,
    local.folders_ignore_only,
    local.folders_protect_only,
    local.folders_standard
  )
}
|
||||
51
environments/modules/grafana_dashboard_folder_xt5/main.tf
Normal file
51
environments/modules/grafana_dashboard_folder_xt5/main.tf
Normal file
@ -0,0 +1,51 @@
|
||||
# One grafana_folder resource per lifecycle combination. Every folder gets the
# last path segment of its group name as title and the group's folder_uid as UID.
#
# Folders are created flat: the parent_folder_uid computed in locals is not
# applied to any resource here, so nesting has to be arranged outside of this
# module (Grafana UI or API).
# NOTE(review): the Grafana provider documents a `parent_folder_uid` argument
# on grafana_folder - confirm whether nesting can now be managed here.

# Title changes are ignored and the folder cannot be destroyed.
resource "grafana_folder" "folders_ignore_and_protect" {
  for_each = local.folders_ignore_and_protect

  org_id = var.org_id
  uid    = each.value.group.folder_uid
  title  = each.value.folder_title

  lifecycle {
    prevent_destroy = true
    ignore_changes  = [title]
  }
}

# Title changes are ignored.
resource "grafana_folder" "folders_ignore_only" {
  for_each = local.folders_ignore_only

  org_id = var.org_id
  uid    = each.value.group.folder_uid
  title  = each.value.folder_title

  lifecycle {
    ignore_changes = [title]
  }
}

# The folder cannot be destroyed.
resource "grafana_folder" "folders_protect_only" {
  for_each = local.folders_protect_only

  org_id = var.org_id
  uid    = each.value.group.folder_uid
  title  = each.value.folder_title

  lifecycle {
    prevent_destroy = true
  }
}

# No lifecycle customisation.
resource "grafana_folder" "folders_standard" {
  for_each = local.folders_standard

  org_id = var.org_id
  uid    = each.value.group.folder_uid
  title  = each.value.folder_title
}
|
||||
21
environments/modules/grafana_dashboard_folder_xt5/outputs.tf
Normal file
21
environments/modules/grafana_dashboard_folder_xt5/outputs.tf
Normal file
@ -0,0 +1,21 @@
|
||||
# Both outputs cover all four folder resource variants and are keyed by alert
# group name (the resources' for_each key).

# Alert group name => folder ID.
output "folder_ids" {
  description = "Mapping of alert group names to their folder IDs in Grafana"
  value = {
    for group_name, folder in merge(
      grafana_folder.folders_ignore_and_protect,
      grafana_folder.folders_ignore_only,
      grafana_folder.folders_protect_only,
      grafana_folder.folders_standard,
    ) : group_name => folder.id
  }
}

# Alert group name => folder UID.
output "folder_uids" {
  description = "Mapping of alert group names to their folder UIDs in Grafana"
  value = {
    for group_name, folder in merge(
      grafana_folder.folders_ignore_and_protect,
      grafana_folder.folders_ignore_only,
      grafana_folder.folders_protect_only,
      grafana_folder.folders_standard,
    ) : group_name => folder.uid
  }
}
|
||||
@ -0,0 +1,17 @@
|
||||
# Alert groups to create folders for. A "/" in dashboard_alert_group_name is
# treated as a path separator by this module's locals; folder_uid is mandatory.
variable "groups" {
  type = list(object({
    dashboard_alert_group_name  = string
    folder_uid                  = string
    alert_definitions_path      = string
    alerts_on_datasources_uid   = list(string)
    dashboard_path_if_exist     = optional(string, null)
    keep_manual_changes         = optional(bool, false)
    prevent_destroy_on_recreate = optional(bool, false)
  }))
  description = "List of alert groups with their definitions"
}

variable "org_id" {
  type        = string
  description = "ID of the Grafana organization"
}
|
||||
@ -0,0 +1,7 @@
|
||||
# Provider requirements of this module.
# Only the provider source is declared; no version constraint is set here.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
53
environments/modules/grafana_dashboard_xt5/locals.tf
Normal file
53
environments/modules/grafana_dashboard_xt5/locals.tf
Normal file
@ -0,0 +1,53 @@
|
||||
locals {
  # One record per "*.json" file found directly in a group's
  # dashboard_path_if_exist (groups without a path contribute nothing).
  # folder_id is looked up by group name in var.folder_ids and is null when absent.
  dashboards_all = flatten([
    for g in var.groups : [
      for json_file in(g.dashboard_path_if_exist != null ? fileset(g.dashboard_path_if_exist, "*.json") : []) : {
        group_name                  = g.dashboard_alert_group_name
        file_path                   = "${g.dashboard_path_if_exist}/${json_file}"
        folder_id                   = lookup(var.folder_ids, g.dashboard_alert_group_name, null)
        keep_manual_changes         = g.keep_manual_changes
        prevent_destroy_on_recreate = g.prevent_destroy_on_recreate
      }
    ]
  ])

  # The four lists below partition dashboards_all by the two lifecycle flags.

  # Dashboards with both manual changes allowed and destroy protection
  dashboards_ignore_and_protect = [
    for d in local.dashboards_all : d
    if d.keep_manual_changes && d.prevent_destroy_on_recreate
  ]

  # Dashboards with only manual changes allowed
  dashboards_ignore_only = [
    for d in local.dashboards_all : d
    if d.keep_manual_changes && !d.prevent_destroy_on_recreate
  ]

  # Dashboards with only destroy protection
  dashboards_protect_only = [
    for d in local.dashboards_all : d
    if d.prevent_destroy_on_recreate && !d.keep_manual_changes
  ]

  # Standard dashboards without any special lifecycle management
  dashboards_standard = [
    for d in local.dashboards_all : d
    if !(d.keep_manual_changes || d.prevent_destroy_on_recreate)
  ]
}
|
||||
52
environments/modules/grafana_dashboard_xt5/main.tf
Normal file
52
environments/modules/grafana_dashboard_xt5/main.tf
Normal file
@ -0,0 +1,52 @@
|
||||
# One grafana_dashboard resource per lifecycle combination. Instances are keyed
# by the dashboard JSON file path, so each.key is the file that gets loaded.

# Changes to config_json are ignored and the dashboard cannot be destroyed.
resource "grafana_dashboard" "dashboards_ignore_and_protect" {
  for_each = { for dash in local.dashboards_ignore_and_protect : dash.file_path => dash }

  org_id      = var.org_id
  folder      = each.value.folder_id
  overwrite   = true
  config_json = file(each.key)

  lifecycle {
    ignore_changes  = [config_json]
    prevent_destroy = true
  }
}

# Changes to config_json are ignored.
resource "grafana_dashboard" "dashboards_ignore_only" {
  for_each = { for dash in local.dashboards_ignore_only : dash.file_path => dash }

  org_id      = var.org_id
  folder      = each.value.folder_id
  overwrite   = true
  config_json = file(each.key)

  lifecycle {
    ignore_changes = [config_json]
  }
}

# The dashboard cannot be destroyed.
resource "grafana_dashboard" "dashboards_protect_only" {
  for_each = { for dash in local.dashboards_protect_only : dash.file_path => dash }

  org_id      = var.org_id
  folder      = each.value.folder_id
  overwrite   = true
  config_json = file(each.key)

  lifecycle {
    prevent_destroy = true
  }
}

# No lifecycle customisation.
resource "grafana_dashboard" "dashboards_standard" {
  for_each = { for dash in local.dashboards_standard : dash.file_path => dash }

  org_id      = var.org_id
  folder      = each.value.folder_id
  overwrite   = true
  config_json = file(each.key)
}
|
||||
9
environments/modules/grafana_dashboard_xt5/outputs.tf
Normal file
9
environments/modules/grafana_dashboard_xt5/outputs.tf
Normal file
@ -0,0 +1,9 @@
|
||||
# Single map over all four dashboard resource variants, keyed by the
# resources' for_each key (the dashboard JSON file path).
output "dashboard_ids" {
  description = "IDs of the created Grafana dashboards"
  value = {
    for file_path, res in merge(
      grafana_dashboard.dashboards_ignore_and_protect,
      grafana_dashboard.dashboards_ignore_only,
      grafana_dashboard.dashboards_protect_only,
      grafana_dashboard.dashboards_standard,
    ) : file_path => res.id
  }
}
|
||||
21
environments/modules/grafana_dashboard_xt5/variables.tf
Normal file
21
environments/modules/grafana_dashboard_xt5/variables.tf
Normal file
@ -0,0 +1,21 @@
|
||||
# Inputs of the grafana_dashboard_xt5 module.

variable "org_id" {
  type        = string
  description = "ID of the organization for dashboards"
}

# dashboard_path_if_exist is the directory scanned for "*.json" dashboards;
# groups that leave it null produce no dashboards.
variable "groups" {
  type = list(object({
    dashboard_alert_group_name  = string
    alert_definitions_path      = string
    alerts_on_datasources_uid   = list(string)
    dashboard_path_if_exist     = optional(string, null)
    keep_manual_changes         = optional(bool, false)
    prevent_destroy_on_recreate = optional(bool, false)
  }))
  description = "List of alert groups with their definitions and data sources"
}

# Keyed by dashboard_alert_group_name.
variable "folder_ids" {
  type        = map(string)
  description = "Mapping of folder IDs for each alert group"
}
|
||||
7
environments/modules/grafana_dashboard_xt5/versions.tf
Normal file
7
environments/modules/grafana_dashboard_xt5/versions.tf
Normal file
@ -0,0 +1,7 @@
|
||||
# Provider requirements of the grafana_dashboard_xt5 module.
# Only the provider source is declared; no version constraint is set here.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
25
environments/modules/grafana_datasource/locals.tf
Normal file
25
environments/modules/grafana_datasource/locals.tf
Normal file
@ -0,0 +1,25 @@
|
||||
locals {
  # var.datasources partitioned into four disjoint maps keyed by data source
  # name, one per combination of the keep_manual_changes /
  # prevent_destroy_on_recreate flags (a missing flag counts as false).

  # Standard data sources without any special lifecycle management
  datasources_standard = {
    for source in var.datasources : source.name => source
    if !(lookup(source, "keep_manual_changes", false) || lookup(source, "prevent_destroy_on_recreate", false))
  }

  # Data sources with only manual changes allowed
  datasources_ignore_only = {
    for source in var.datasources : source.name => source
    if lookup(source, "keep_manual_changes", false) && !lookup(source, "prevent_destroy_on_recreate", false)
  }

  # Data sources with only destroy protection enabled
  datasources_protect_only = {
    for source in var.datasources : source.name => source
    if lookup(source, "prevent_destroy_on_recreate", false) && !lookup(source, "keep_manual_changes", false)
  }

  # Data sources with both manual changes allowed and destroy protection
  datasources_ignore_and_protect = {
    for source in var.datasources : source.name => source
    if lookup(source, "keep_manual_changes", false) && lookup(source, "prevent_destroy_on_recreate", false)
  }
}
|
||||
173
environments/modules/grafana_datasource/main.tf
Normal file
173
environments/modules/grafana_datasource/main.tf
Normal file
@ -0,0 +1,173 @@
|
||||
# Encoded JSON payloads, computed once per data source (keyed by name, the same
# key the four resources below iterate over) instead of being repeated in
# every resource.
locals {
  # jsonData: the caller's json_data plus explicitly defaulted keys.
  # basicAuth / basicAuthUser are kept here unchanged so the encoded value of
  # existing data sources does not drift; the effective basic-auth settings
  # are the resource arguments basic_auth_enabled / basic_auth_username.
  datasource_json_data_encoded = {
    for ds in var.datasources : ds.name => jsonencode(
      merge(
        ds.json_data,
        {
          basicAuth        = lookup(ds, "basic_auth", false),
          basicAuthUser    = lookup(ds, "basic_auth_user", null),
          protocol         = lookup(ds.json_data, "protocol", "http"),
          database         = lookup(ds.json_data, "database", null),
          maxOpenConns     = tonumber(lookup(ds.json_data, "maxOpenConns", null)),
          maxIdleConns     = tonumber(lookup(ds.json_data, "maxIdleConns", null)),
          maxIdleConnsAuto = lookup(ds.json_data, "maxIdleConnsAuto", null),
          connMaxLifetime  = tonumber(lookup(ds.json_data, "connMaxLifetime", null))
        }
      )
    )
  }

  # secureJsonData: the caller's secure_json_data. basicAuthPassword falls back
  # to the dedicated basic_auth_password input, which was previously declared
  # but never read.
  datasource_secure_json_data_encoded = {
    for ds in var.datasources : ds.name => jsonencode(
      merge(
        ds.secure_json_data,
        {
          basicAuthPassword = lookup(ds.secure_json_data, "basicAuthPassword", ds.basic_auth_password),
          password          = lookup(ds.secure_json_data, "password", null)
        }
      )
    )
  }
}

# The four resources differ only in their lifecycle block; lifecycle settings
# cannot be chosen per instance, hence one resource per combination.
#
# basic_auth_enabled / basic_auth_username are set from the basic_auth /
# basic_auth_user inputs: copying them into jsonData alone did not set the
# provider's basic-auth arguments.

# Manual changes to the listed attributes are ignored; destroy is blocked.
resource "grafana_data_source" "datasources_ignore_and_protect" {
  for_each = local.datasources_ignore_and_protect

  org_id       = var.org_id
  uid          = each.value.uid
  name         = each.value.name
  type         = each.value.type
  access_mode  = each.value.access_mode
  is_default   = each.value.is_default
  url          = lookup(each.value, "url", null)
  username     = lookup(each.value, "username", null)
  http_headers = each.value.http_headers

  basic_auth_enabled  = each.value.basic_auth
  basic_auth_username = each.value.basic_auth_user

  json_data_encoded        = local.datasource_json_data_encoded[each.key]
  secure_json_data_encoded = local.datasource_secure_json_data_encoded[each.key]

  lifecycle {
    ignore_changes  = [name, url, access_mode, is_default, json_data_encoded, secure_json_data_encoded]
    prevent_destroy = true
  }
}

# Manual changes to the listed attributes are ignored.
resource "grafana_data_source" "datasources_ignore_only" {
  for_each = local.datasources_ignore_only

  org_id       = var.org_id
  uid          = each.value.uid
  name         = each.value.name
  type         = each.value.type
  access_mode  = each.value.access_mode
  is_default   = each.value.is_default
  url          = lookup(each.value, "url", null)
  username     = lookup(each.value, "username", null)
  http_headers = each.value.http_headers

  basic_auth_enabled  = each.value.basic_auth
  basic_auth_username = each.value.basic_auth_user

  json_data_encoded        = local.datasource_json_data_encoded[each.key]
  secure_json_data_encoded = local.datasource_secure_json_data_encoded[each.key]

  lifecycle {
    ignore_changes = [name, url, access_mode, is_default, json_data_encoded, secure_json_data_encoded]
  }
}

# Destroy is blocked.
resource "grafana_data_source" "datasources_protect_only" {
  for_each = local.datasources_protect_only

  org_id       = var.org_id
  uid          = each.value.uid
  name         = each.value.name
  type         = each.value.type
  access_mode  = each.value.access_mode
  is_default   = each.value.is_default
  url          = lookup(each.value, "url", null)
  username     = lookup(each.value, "username", null)
  http_headers = each.value.http_headers

  basic_auth_enabled  = each.value.basic_auth
  basic_auth_username = each.value.basic_auth_user

  json_data_encoded        = local.datasource_json_data_encoded[each.key]
  secure_json_data_encoded = local.datasource_secure_json_data_encoded[each.key]

  lifecycle {
    prevent_destroy = true
  }
}

# No lifecycle customisation.
resource "grafana_data_source" "datasources_standard" {
  for_each = local.datasources_standard

  org_id       = var.org_id
  uid          = each.value.uid
  name         = each.value.name
  type         = each.value.type
  access_mode  = each.value.access_mode
  is_default   = each.value.is_default
  url          = lookup(each.value, "url", null)
  username     = lookup(each.value, "username", null)
  http_headers = each.value.http_headers

  basic_auth_enabled  = each.value.basic_auth
  basic_auth_username = each.value.basic_auth_user

  json_data_encoded        = local.datasource_json_data_encoded[each.key]
  secure_json_data_encoded = local.datasource_secure_json_data_encoded[each.key]
}
|
||||
17
environments/modules/grafana_datasource/outputs.tf
Normal file
17
environments/modules/grafana_datasource/outputs.tf
Normal file
@ -0,0 +1,17 @@
|
||||
output "datasource_mapping" {
  description = "Mapping of data source names to their UIDs across all datasource categories"

  # One name => uid map is built per resource collection and the maps are then
  # merged in the listed order, so a later collection overrides an earlier one
  # when the same name appears twice. Instances without an id are left out.
  value = merge([
    for collection in [
      grafana_data_source.datasources_ignore_and_protect,
      grafana_data_source.datasources_ignore_only,
      grafana_data_source.datasources_protect_only,
      grafana_data_source.datasources_standard,
    ] : {
      for ds in collection : ds.name => ds.uid
      if ds.id != null
    }
  ]...)
}
|
||||
32
environments/modules/grafana_datasource/variables.tf
Normal file
32
environments/modules/grafana_datasource/variables.tf
Normal file
@ -0,0 +1,32 @@
|
||||
# Passed to every data source resource of this module as org_id.
variable "org_id" {
  description = "Organization ID where the resources should be created"
  type        = string
}
|
||||
|
||||
variable "datasources" {
  description = "List of Grafana data sources"

  type = list(object({
    # --- Identity and connection ---
    uid         = string                 # Unique source identifier
    name        = string                 # Data source name (displayed in Grafana)
    type        = string                 # Data source type (e.g., prometheus, mysql, clickhouse)
    access_mode = string                 # Access mode: proxy or direct
    is_default  = bool                   # Set as default source
    url         = optional(string, null) # Connection URL (for most sources)
    username    = optional(string, null)

    # --- Basic authentication ---
    basic_auth          = optional(bool, false)  # Use basic authentication
    basic_auth_user     = optional(string, null) # Username for basic authentication
    basic_auth_password = optional(string, null) # Password for basic authentication

    # --- Free-form settings ---
    json_data        = optional(map(any), {})    # Additional parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format
    http_headers     = optional(map(string), {})

    # --- Terraform lifecycle management ---
    keep_manual_changes         = optional(bool, false) # Ignore manual changes in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
}
|
||||
7
environments/modules/grafana_datasource/versions.tf
Normal file
7
environments/modules/grafana_datasource/versions.tf
Normal file
@ -0,0 +1,7 @@
|
||||
# Provider requirements for this module. No version constraint is declared.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
@ -0,0 +1,7 @@
|
||||
locals {
  # Name of the first contact point flagged is_default, or null when there is
  # none (indexing the empty list fails and try() falls through to null).
  default_contact_point_name = try(
    [
      for cp in var.contact_points : cp.name
      if try(cp.is_default, false)
    ][0],
    null
  )
}
|
||||
62
environments/modules/grafana_notification_policies/main.tf
Normal file
62
environments/modules/grafana_notification_policies/main.tf
Normal file
@ -0,0 +1,62 @@
|
||||
# Root notification policy. It is created only when a default contact point
# was resolved in locals; otherwise count is 0 and nothing is managed.
resource "grafana_notification_policy" "default_policy" {
  count = local.default_contact_point_name == null ? 0 : 1

  org_id             = var.org_id
  disable_provenance = var.disable_provenance

  # Root route settings
  contact_point   = local.default_contact_point_name
  group_by        = var.group_by
  group_wait      = var.group_wait
  group_interval  = var.group_interval
  repeat_interval = var.repeat_interval

  # First-level routes. Explicit iterator names keep the two nesting levels
  # apart instead of both being called "policy".
  dynamic "policy" {
    for_each = var.notification_policies
    iterator = route

    content {
      contact_point   = route.value.contact_point
      continue        = route.value.continue
      group_by        = route.value.group_by
      group_wait      = route.value.group_wait
      group_interval  = route.value.group_interval
      repeat_interval = route.value.repeat_interval

      dynamic "matcher" {
        for_each = route.value.matchers

        content {
          label = matcher.value.label
          match = matcher.value.match
          value = matcher.value.value
        }
      }

      # Second-level routes nested under the current first-level route.
      dynamic "policy" {
        for_each = try(route.value.policies, [])
        iterator = child

        content {
          contact_point   = child.value.contact_point
          continue        = child.value.continue
          group_by        = child.value.group_by
          group_wait      = child.value.group_wait
          group_interval  = child.value.group_interval
          repeat_interval = child.value.repeat_interval

          dynamic "matcher" {
            for_each = child.value.matchers

            content {
              label = matcher.value.label
              match = matcher.value.match
              value = matcher.value.value
            }
          }
        }
      }
    }
  }

  # Lifecycle overrides are intentionally left disabled:
  # lifecycle {
  #   prevent_destroy = false
  #   ignore_changes  = all
  # }
}
|
||||
@ -0,0 +1,76 @@
|
||||
# Within this module only `name` and `is_default` are read (see locals.tf),
# to pick the contact point used by the root notification policy.
variable "contact_points" {
  description = "List of contact points"

  type = list(object({
    name       = string
    type       = string
    settings   = map(string)
    labels     = optional(map(string))
    is_default = optional(bool, false)
  }))
}
|
||||
|
||||
# Organization that owns the notification policy managed by this module.
variable "org_id" {
  type        = string
  description = "Grafana organization ID"
}
|
||||
|
||||
# Forwarded unchanged to grafana_notification_policy.disable_provenance.
# The previous description ("Controls whether Grafana provisioning is disabled")
# did not say what the flag does.
variable "disable_provenance" {
  description = "When true, allows the notification policy to be modified from sources other than Terraform or the Grafana API (for example the Grafana UI)"
  type        = bool
  default     = true
}
|
||||
|
||||
# Grouping labels applied on the root notification policy.
variable "group_by" {
  type        = list(string)
  default     = ["alertname"]
  description = "A list of alert labels to group alerts into notifications"
}
|
||||
|
||||
# Root-policy group_wait, given as a duration string.
variable "group_wait" {
  type        = string
  default     = "30s"
  description = "Time to wait to buffer alerts of the same group before sending a notification"
}
|
||||
|
||||
# Root-policy group_interval, given as a duration string.
variable "group_interval" {
  type        = string
  default     = "5m"
  description = "Minimum time interval between two notifications for the same group"
}
|
||||
|
||||
# Root-policy repeat_interval, given as a duration string.
variable "repeat_interval" {
  type        = string
  default     = "4h"
  description = "Minimum time interval for re-sending a notification if an alert is still firing"
}
|
||||
|
||||
# Two levels of routing are supported: top-level routes and one level of
# nested routes under each of them. Both levels share the same shape.
variable "notification_policies" {
  description = "Routing rules for specific label sets"
  default     = []

  type = list(object({
    contact_point = string

    # Allowed operators are = for equality, != for negated equality,
    # =~ for regex equality, and !~ for negated regex equality
    matchers = list(object({
      label = string
      match = string
      value = string
    }))

    continue        = optional(bool)
    group_by        = optional(list(string))
    group_wait      = optional(string)
    group_interval  = optional(string)
    repeat_interval = optional(string)

    # Nested routes; an omitted value becomes an empty list.
    policies = optional(list(object({
      contact_point = string

      matchers = list(object({
        label = string
        match = string
        value = string
      }))

      continue        = optional(bool)
      group_by        = optional(list(string))
      group_wait      = optional(string)
      group_interval  = optional(string)
      repeat_interval = optional(string)
    })), [])
  }))
}
|
||||
@ -0,0 +1,7 @@
|
||||
# Provider requirements for this module. No version constraint is declared.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
65
environments/modules/grafana_rule_group/locals.tf
Normal file
65
environments/modules/grafana_rule_group/locals.tf
Normal file
@ -0,0 +1,65 @@
|
||||
locals {
  # Seconds per unit suffix; main.tf uses it to convert "15m"-style durations.
  duration_units = {
    "s" = 1
    "m" = 60
    "h" = 3600
    "d" = 86400
  }

  # Data source UID => display name, and UID => plugin type.
  datasource_mapping      = { for ds in var.datasources : ds.uid => ds.name }
  datasource_mapping_type = { for ds in var.datasources : ds.uid => lookup(ds, "type", "prometheus") }

  # Folder UID => query look-back in seconds, defaulting to 60 (1 minute).
  # Groups whose name has no entry in var.folder_uids resolve to null; those
  # are filtered out because null is not a valid map key and would fail the plan.
  folder_time_ranges = {
    for uid in distinct([
      for group in var.groups :
      lookup(var.folder_uids, group.dashboard_alert_group_name, null)
      if group.dashboard_alert_group_name != null
    ]) :
    uid => lookup(var.folder_time_ranges, uid, 60)
    if uid != null
  }

  # One entry per (group, data source UID) pair, each carrying every alert
  # definition found under the group's alert_definitions_path.
  combined_alerts = flatten([
    for group in var.groups : [
      for datasource_uid in group.alerts_on_datasources_uid :
      {
        alert_group_name = group.dashboard_alert_group_name
        folder_uid       = lookup(var.folder_uids, group.dashboard_alert_group_name, null)
        datasource_name  = lookup(local.datasource_mapping, datasource_uid, "unknown")
        datasource_uid   = datasource_uid
        datasource_type  = lookup(local.datasource_mapping_type, datasource_uid, "prometheus")

        alert_files = [
          for decoded in [
            # fileset() rejects a null path, and a for-expression evaluates its
            # collection before its `if` filter, so the null case has to be
            # absorbed here rather than in the filter. The filter still drops
            # null, empty and whitespace-only paths.
            for file_path in try(fileset(group.alert_definitions_path, "**/*.yaml"), []) :
            {
              file_path = file_path
              # Decode each file once; the fields below are read from this value.
              content = yamldecode(file("${group.alert_definitions_path}/${file_path}"))
            }
            if try(trimspace(group.alert_definitions_path), "") != ""
          ] :
          {
            # Full YAML content
            content = decoded.content

            # Commonly used fields, with fallbacks when the key is absent
            name        = try(decoded.content.name, null)
            alert_type  = try(decoded.content.datasource_type, "prometheus")
            editor_type = try(decoded.content.editor_type, null)
            mode        = try(decoded.content.mode, "single")

            # File metadata; the category is the first path segment below the
            # definitions directory
            alert_file_path = "${group.alert_definitions_path}/${decoded.file_path}"
            alert_category  = split("/", decoded.file_path)[0]
          }
        ]
      }
    ]
  ])

  # Rule groups keyed by "<datasource name> (<alert group name>)".
  # NOTE(review): two entries that produce the same key (for example two UIDs
  # that both fall back to "unknown") would make this expression fail with a
  # duplicate-key error - confirm that callers never pass such input.
  grouped_alerts_by_datasource = {
    for alert in local.combined_alerts :
    "${alert.datasource_name} (${alert.alert_group_name})" => merge(alert, {
      alert_files = flatten([
        for a in local.combined_alerts :
        a.alert_files if a.datasource_name == alert.datasource_name && a.alert_group_name == alert.alert_group_name
      ])
    })
  }
}
|
||||
224
environments/modules/grafana_rule_group/main.tf
Normal file
224
environments/modules/grafana_rule_group/main.tf
Normal file
@ -0,0 +1,224 @@
|
||||
# One Grafana rule group per "<datasource name> (<alert group name>)" entry
# that has at least one alert definition. Each YAML definition becomes one rule
# made of data nodes: the query node(s), optional reduce and math nodes, and a
# threshold node T.
resource "grafana_rule_group" "alert_groups" {
  for_each = {
    for k, v in local.grouped_alerts_by_datasource :
    k => v if length(v.alert_files) > 0
  }

  # Main parameters
  name               = each.key
  org_id             = var.org_id
  interval_seconds   = var.default_evaluation_interval # Group-wide evaluation interval
  folder_uid         = each.value.folder_uid
  disable_provenance = var.disable_provenance

  # Rules configuration
  dynamic "rule" {
    for_each = each.value.alert_files
    content {
      name = "${rule.value.name} (${each.value.datasource_uid})"

      # Alerts with a `functions` list: the condition is the math ref_id of the
      # last function (falling back to T when that function has no math node).
      # All other alerts: the condition is threshold node T.
      condition = length(try(rule.value.content.functions, [])) > 0 ? try(rule.value.content.functions[length(rule.value.content.functions) - 1].math.ref_id, "T") : "T"

      for = try(
        coalesce(
          # Convert a duration string such as "15m" or "24h" to whole seconds
          can(regex("^[0-9]+(s|m|h|d)$", rule.value.content.for)) ? format(
            "%ds",
            tonumber(regex("^([0-9]+)", rule.value.content.for)[0]) *
            lookup(local.duration_units, regex("[smhd]$", rule.value.content.for), 1)
          ) : null,
          # Fallback to frequency or default duration
          format("%ds", try(rule.value.content.frequency, var.default_alert_duration))
        ),
        format("%ds", var.default_alert_duration)
      )

      # Query nodes: one per entry of `queries` when that map is present,
      # otherwise a single node A built from `expression`.
      dynamic "data" {
        for_each = can(rule.value.content.queries) ? [
          for ref_id, query in rule.value.content.queries : {
            ref_id = ref_id
            query  = query
          }
        ] : [{ ref_id = "A", query = try(rule.value.content.expression, "") }]
        content {
          ref_id         = data.value.ref_id
          datasource_uid = each.value.datasource_uid
          query_type     = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
          model = jsonencode(
            merge(
              {
                refId         = data.value.ref_id
                intervalMs    = var.default_interval_ms
                maxDataPoints = var.default_max_data_points
                instant       = false
                datasource = {
                  type = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
                  uid  = each.value.datasource_uid
                }
              },

              # Query fields depend on the data source type: ClickHouse gets a
              # raw SQL model, every other type gets an expr-based model.
              contains(["grafana-clickhouse-datasource"],
              lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")) ? {
                # Use time_series format for queries with time grouping, table format for direct aggregation
                format     = can(regex("\\$__timeGroupAlias", data.value.query)) ? "time_series" : null
                formatAs   = can(regex("\\$__timeGroupAlias", data.value.query)) ? null : "table"
                queryType  = "sql"
                rawSql     = data.value.query
                editorMode = "code"
                editorType = "sql"
                } : {
                # Use this node's own query. It equals `expression` for
                # single-query alerts, but with a `queries` map each ref_id must
                # carry its own text rather than the shared `expression`.
                expr      = data.value.query
                format    = "time_series"
                queryType = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
              }
            )
          )
          relative_time_range {
            from = try(
              rule.value.content.relative_time_range.from, # First try alert's own config
              lookup(                                      # Then try folder settings
                local.folder_time_ranges,
                each.value.folder_uid,
                var.default_time_range_from # Finally fallback to global default
              )
            )
            to = 0
          }
        }
      }

      # Reduce nodes: one per function that defines `reduce`, or a single
      # node B over A when the alert sets need_reduce.
      dynamic "data" {
        for_each = length(try(rule.value.content.functions, [])) > 0 ? [
          for func in rule.value.content.functions : {
            ref_id     = try(func.reduce.ref_id, "B")
            expression = try(func.reduce.input, "A")
            reducer    = try(func.reduce.function, "last")
            mode       = try(func.reduce.mode, "strict")
          } if try(func.reduce, null) != null
          ] : try(rule.value.content.need_reduce, false) ? [{
            ref_id     = "B"
            expression = "A"
            # 'avg' is written as 'mean'; any other reducer_type is passed
            # through, and 'last' is used when reducer_type is missing.
            reducer = try(
              rule.value.content.reducer_type == "avg" ? "mean" : rule.value.content.reducer_type,
              "last"
            )
            mode = "strict"
        }] : []
        content {
          ref_id         = data.value.ref_id
          datasource_uid = "__expr__"
          model = jsonencode({
            refId      = data.value.ref_id
            type       = "reduce"
            expression = data.value.expression
            reducer    = data.value.reducer
            mode       = data.value.mode
          })
          relative_time_range {
            from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
            to   = try(rule.value.content.relative_time_range.to, 0)
          }
        }
      }

      # Math nodes. Node references used by this module:
      # - Node A (or the keys of `queries`): initial query
      # - Node B: reduction (created when need_reduce is true)
      # - Node M / M<idx> / func.math.ref_id: math expression (optional)
      # - Node T: threshold evaluation
      dynamic "data" {
        for_each = length(try(rule.value.content.functions, [])) > 0 ? [
          for idx, func in rule.value.content.functions : {
            ref_id     = try(func.math.ref_id, "M${idx}")
            expression = func.math.expression
          } if try(func.math, null) != null
          ] : try(rule.value.content.math_expression, null) != null ? [{
            ref_id     = "M"
            expression = rule.value.content.math_expression
        }] : []
        content {
          ref_id         = data.value.ref_id
          datasource_uid = "__expr__"
          model = jsonencode({
            refId      = data.value.ref_id
            type       = "math"
            expression = data.value.expression
            input      = try(rule.value.content.need_reduce ? "B" : "A", "A")
          })
          relative_time_range {
            from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
            to   = try(rule.value.content.relative_time_range.to, 0)
          }
        }
      }

      # Threshold node, always emitted with ref_id T
      data {
        ref_id         = "T"
        datasource_uid = "__expr__"
        model = jsonencode({
          refId = "T"
          type  = "threshold"
          # Evaluate B when the alert is reduced, otherwise A
          expression = try(rule.value.content.need_reduce ? "B" : "A", "A")
          conditions = [
            {
              evaluator = merge(
                {
                  type = rule.value.content.condition_type
                },
                # Range conditions take threshold_range, all others take the
                # single threshold value
                contains(["outside_range", "within_range"], rule.value.content.condition_type) ? {
                  params = rule.value.content.threshold_range
                  } : {
                  params = [rule.value.content.threshold]
                }
              )
              operator = { type = "and" }
              reducer  = { type = "mean", params = [] }
              query    = { params = [] }
              type     = "query"
            }
          ]
        })
        relative_time_range {
          from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
          to   = try(rule.value.content.relative_time_range.to, 0)
        }
      }

      # Rule metadata
      annotations = {
        summary     = rule.value.content.summary
        description = try(rule.value.content.description, "")
        # Lower bound of the range for range conditions, the plain threshold otherwise
        threshold = try(
          contains(["outside_range", "within_range"], try(rule.value.content.condition_type, "gt")) ?
          tostring(try(rule.value.content.threshold_range[0], "")) :
          tostring(try(rule.value.content.threshold, ""))
        )
      }

      labels = rule.value.content.labels

      no_data_state  = try(rule.value.content.no_data_state, var.default_no_data_state)
      exec_err_state = try(rule.value.content.exec_err_state, var.default_exec_err_state)
    }
  }
}
|
||||
122
environments/modules/grafana_rule_group/variables.tf
Normal file
122
environments/modules/grafana_rule_group/variables.tf
Normal file
@ -0,0 +1,122 @@
|
||||
# locals.tf reads dashboard_alert_group_name, alert_definitions_path and
# alerts_on_datasources_uid; the remaining fields are not referenced there.
variable "groups" {
  description = "List of alert groups with their definitions and data sources"

  type = list(object({
    dashboard_alert_group_name  = string
    alerts_on_datasources_uid   = list(string)
    alert_definitions_path      = optional(string, null)
    dashboard_path_if_exist     = optional(string, null)
    keep_manual_changes         = optional(bool, false)
    prevent_destroy_on_recreate = optional(bool, false)
  }))
}
|
||||
|
||||
# locals.tf reads only uid, name and type from each entry to build the
# UID => name and UID => type lookups.
variable "datasources" {
  description = "List of Grafana data sources"

  type = list(object({
    # --- Identity and connection ---
    uid         = string                 # Unique source identifier
    name        = string                 # Data source name (displayed in Grafana)
    type        = string                 # Data source type (e.g., prometheus, mysql, clickhouse)
    access_mode = string                 # Access mode: proxy or direct
    is_default  = bool                   # Set as default source
    url         = optional(string, null) # Connection URL (for most sources)
    username    = optional(string, null)

    # --- Basic authentication ---
    basic_auth          = optional(bool, false)  # Use basic authentication
    basic_auth_user     = optional(string, null) # Username for basic authentication
    basic_auth_password = optional(string, null) # Password for basic authentication

    # --- Free-form settings ---
    json_data        = optional(map(any), {})    # Additional parameters in JSON format
    secure_json_data = optional(map(string), {}) # Sensitive data in JSON format

    # --- Terraform lifecycle management ---
    keep_manual_changes         = optional(bool, false) # Ignore manual changes in Grafana
    prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
  }))
}
|
||||
|
||||
# Organization that owns every rule group created by this module.
variable "org_id" {
  type        = string
  description = "ID of the Grafana organization"
}
|
||||
|
||||
# Looked up by groups[*].dashboard_alert_group_name to find each rule group's folder.
variable "folder_uids" {
  type        = map(string)
  description = "Mapping of alert group names to their folder UIDs"
}
|
||||
|
||||
# Per-folder override of the query look-back; folders missing from this map
# get 60 seconds in locals.tf.
variable "folder_time_ranges" {
  type    = map(number)
  default = {}

  description = <<-EOT
    Mapping of folder UIDs to their default time ranges in seconds.
    If not specified for a folder, alerts will use the folder's default of 60 seconds (1 minute).
    Example:
    {
      "folder1_uid" = 300  # 5 minutes
      "folder2_uid" = 3600 # 1 hour
    }
  EOT
}
|
||||
|
||||
# Alert duration and timing configuration
|
||||
# NOTE(review): the module's main.tf sets the rule group interval from
# var.default_evaluation_interval; this variable appears to be unused - confirm.
variable "interval_seconds" {
  type        = number
  default     = 60
  description = "Interval in seconds for evaluating alerts"
}
|
||||
|
||||
# Written as intervalMs into every query model.
variable "default_interval_ms" {
  type        = number
  default     = 60000
  description = "Default interval in milliseconds for evaluating alert expressions"
}
|
||||
|
||||
# Written as maxDataPoints into every query model.
variable "default_max_data_points" {
  type        = number
  default     = 43200
  description = "Default maximum number of data points"
}
|
||||
|
||||
# Used when an alert definition does not set no_data_state itself.
variable "default_no_data_state" {
  type        = string
  default     = "OK"
  description = "Default no data state for alerts"
}
|
||||
|
||||
# Used when an alert definition does not set exec_err_state itself.
variable "default_exec_err_state" {
  type        = string
  default     = "Error"
  description = "Default execution error state for alerts"
}
|
||||
|
||||
# Fallback for a rule's `for` when the definition has neither a parsable
# `for` duration nor a `frequency`.
variable "default_alert_duration" {
  type        = number
  default     = 300 # 5 minutes
  description = "Default duration (in seconds) for how long a condition must be true before alerting"
}
|
||||
|
||||
# Assigned to interval_seconds of every rule group in main.tf.
variable "default_evaluation_interval" {
  type        = number
  default     = 60 # 1 minute
  description = "Default interval (in seconds) between alert rule evaluations"
}
|
||||
|
||||
# Last-resort look-back for query nodes, used only when neither the alert
# definition nor local.folder_time_ranges provides a value.
variable "default_time_range_from" {
  type        = number
  default     = 604800 # 7 days
  description = "Default time range (in seconds) for main query lookback"
}
|
||||
|
||||
# Look-back for the reduce, math and threshold nodes when the alert
# definition does not set relative_time_range.from.
variable "default_processing_range" {
  type        = number
  default     = 600 # 10 minutes
  description = "Default time range (in seconds) for processing blocks"
}
|
||||
|
||||
# Forwarded unchanged to grafana_rule_group.disable_provenance.
# The previous description ("Controls whether Grafana provisioning is disabled")
# did not say what the flag does.
variable "disable_provenance" {
  description = "When true, allows the rule groups to be modified from sources other than Terraform or the Grafana API (for example the Grafana UI)"
  type        = bool
  default     = true
}
|
||||
7
environments/modules/grafana_rule_group/versions.tf
Normal file
7
environments/modules/grafana_rule_group/versions.tf
Normal file
@ -0,0 +1,7 @@
|
||||
# Provider requirements for this module. No version constraint is declared.
terraform {
  required_providers {
    grafana = { source = "grafana/grafana" }
  }
}
|
||||
65
environments/modules/grafana_rule_group_test/locals.tf
Normal file
65
environments/modules/grafana_rule_group_test/locals.tf
Normal file
@ -0,0 +1,65 @@
|
||||
locals {
  # Seconds per unit suffix; main.tf uses it to convert "15m"-style durations.
  duration_units = {
    "s" = 1
    "m" = 60
    "h" = 3600
    "d" = 86400
  }

  # Data source UID => display name, and UID => plugin type.
  datasource_mapping      = { for ds in var.datasources : ds.uid => ds.name }
  datasource_mapping_type = { for ds in var.datasources : ds.uid => lookup(ds, "type", "prometheus") }

  # Folder UID => query look-back in seconds, defaulting to 60 (1 minute).
  # Groups whose name has no entry in var.folder_uids resolve to null; those
  # are filtered out because null is not a valid map key and would fail the plan.
  folder_time_ranges = {
    for uid in distinct([
      for group in var.groups :
      lookup(var.folder_uids, group.dashboard_alert_group_name, null)
      if group.dashboard_alert_group_name != null
    ]) :
    uid => lookup(var.folder_time_ranges, uid, 60)
    if uid != null
  }

  # One entry per (group, data source UID) pair, each carrying every alert
  # definition found under the group's alert_definitions_path.
  combined_alerts = flatten([
    for group in var.groups : [
      for datasource_uid in group.alerts_on_datasources_uid :
      {
        alert_group_name = group.dashboard_alert_group_name
        folder_uid       = lookup(var.folder_uids, group.dashboard_alert_group_name, null)
        datasource_name  = lookup(local.datasource_mapping, datasource_uid, "unknown")
        datasource_uid   = datasource_uid
        datasource_type  = lookup(local.datasource_mapping_type, datasource_uid, "prometheus")

        alert_files = [
          for decoded in [
            # fileset() rejects a null path, and a for-expression evaluates its
            # collection before its `if` filter, so the null case has to be
            # absorbed here rather than in the filter. The filter still drops
            # null, empty and whitespace-only paths.
            for file_path in try(fileset(group.alert_definitions_path, "**/*.yaml"), []) :
            {
              file_path = file_path
              # Render the file as a template with var.alert_variables and
              # decode it once; the fields below are read from this value.
              content = yamldecode(templatefile("${group.alert_definitions_path}/${file_path}", var.alert_variables))
            }
            if try(trimspace(group.alert_definitions_path), "") != ""
          ] :
          {
            # Full YAML content after variable substitution
            content = decoded.content

            # Commonly used fields, with fallbacks when the key is absent
            name        = try(decoded.content.name, null)
            alert_type  = try(decoded.content.datasource_type, "prometheus")
            editor_type = try(decoded.content.editor_type, null)
            mode        = try(decoded.content.mode, "single")

            # File metadata; the category is the first path segment below the
            # definitions directory
            alert_file_path = "${group.alert_definitions_path}/${decoded.file_path}"
            alert_category  = split("/", decoded.file_path)[0]
          }
        ]
      }
    ]
  ])

  # Rule groups keyed by "<datasource name> (<alert group name>)".
  # NOTE(review): two entries that produce the same key (for example two UIDs
  # that both fall back to "unknown") would make this expression fail with a
  # duplicate-key error - confirm that callers never pass such input.
  grouped_alerts_by_datasource = {
    for alert in local.combined_alerts :
    "${alert.datasource_name} (${alert.alert_group_name})" => merge(alert, {
      alert_files = flatten([
        for a in local.combined_alerts :
        a.alert_files if a.datasource_name == alert.datasource_name && a.alert_group_name == alert.alert_group_name
      ])
    })
  }
}
|
||||
224
environments/modules/grafana_rule_group_test/main.tf
Normal file
224
environments/modules/grafana_rule_group_test/main.tf
Normal file
@ -0,0 +1,224 @@
|
||||
resource "grafana_rule_group" "alert_groups" {
|
||||
for_each = {
|
||||
for k, v in local.grouped_alerts_by_datasource :
|
||||
k => v if length(v.alert_files) > 0
|
||||
}
|
||||
|
||||
# Main parameters
|
||||
name = each.key
|
||||
org_id = var.org_id
|
||||
interval_seconds = var.default_evaluation_interval # Group-wide evaluation interval
|
||||
folder_uid = each.value.folder_uid
|
||||
disable_provenance = var.disable_provenance
|
||||
|
||||
# Rules configuration
|
||||
dynamic "rule" {
|
||||
for_each = each.value.alert_files
|
||||
content {
|
||||
name = "${rule.value.name} (${each.value.datasource_uid})"
|
||||
# Business alerts: use math node D as condition
|
||||
# Simple alerts: use threshold node T
|
||||
condition = length(try(rule.value.content.functions, [])) > 0 ? try(rule.value.content.functions[length(rule.value.content.functions) - 1].math.ref_id, "T") : "T"
|
||||
for = try(
|
||||
coalesce(
|
||||
# Try to parse duration string (e.g., "15m", "24h")
|
||||
can(regex("^[0-9]+(s|m|h|d)$", rule.value.content.for)) ? format(
|
||||
"%ds",
|
||||
tonumber(regex("^([0-9]+)", rule.value.content.for)[0]) *
|
||||
lookup(local.duration_units, regex("[smhd]$", rule.value.content.for), 1)
|
||||
) : null,
|
||||
# Fallback to frequency or default duration
|
||||
format("%ds", try(rule.value.content.frequency, var.default_alert_duration))
|
||||
),
|
||||
format("%ds", var.default_alert_duration)
|
||||
)
|
||||
|
||||
# Data configuration
|
||||
# Single data block for the alert expression
|
||||
# Both SQL and Prometheus alerts use a single expression
|
||||
dynamic "data" {
|
||||
for_each = can(rule.value.content.queries) ? [
|
||||
for ref_id, query in rule.value.content.queries : {
|
||||
ref_id = ref_id
|
||||
query = query
|
||||
}
|
||||
] : [{ ref_id = "A", query = try(rule.value.content.expression, "") }]
|
||||
content {
|
||||
ref_id = data.value.ref_id
|
||||
datasource_uid = each.value.datasource_uid
|
||||
query_type = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
|
||||
model = jsonencode(
|
||||
merge(
|
||||
{
|
||||
refId = data.value.ref_id
|
||||
intervalMs = var.default_interval_ms
|
||||
maxDataPoints = var.default_max_data_points
|
||||
instant = false
|
||||
datasource = {
|
||||
type = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
|
||||
uid = each.value.datasource_uid
|
||||
}
|
||||
},
|
||||
|
||||
# The query type is determined by the datasource type
|
||||
# The expression field contains the actual query for both SQL and Prometheus
|
||||
# Query configuration based on datasource type
|
||||
# Handle SQL-based datasources (mysql, clickhouse) and prometheus-compatible ones
|
||||
# SQL query configuration (mysql, clickhouse)
|
||||
contains(["grafana-clickhouse-datasource"],
|
||||
lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")) ? {
|
||||
# Use time_series format for queries with time grouping, table format for direct aggregation
|
||||
format = can(regex("\\$__timeGroupAlias", data.value.query)) ? "time_series" : null
|
||||
formatAs = can(regex("\\$__timeGroupAlias", data.value.query)) ? null : "table"
|
||||
queryType = "sql"
|
||||
rawSql = data.value.query
|
||||
editorMode = "code"
|
||||
editorType = "sql"
|
||||
} : {
|
||||
# Prometheus-compatible datasources (prometheus, victoriametrics)
|
||||
expr = try(rule.value.content.expression, "")
|
||||
format = "time_series"
|
||||
queryType = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
|
||||
}
|
||||
)
|
||||
)
|
||||
relative_time_range {
|
||||
from = try(
|
||||
rule.value.content.relative_time_range.from, # First try alert's own config
|
||||
lookup( # Then try folder settings
|
||||
local.folder_time_ranges,
|
||||
each.value.folder_uid,
|
||||
var.default_time_range_from # Finally fallback to global default
|
||||
)
|
||||
)
|
||||
to = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Unified reduction processing
|
||||
# Handle both function-based and simple reductions
|
||||
dynamic "data" {
|
||||
for_each = length(try(rule.value.content.functions, [])) > 0 ? [
|
||||
for func in rule.value.content.functions : {
|
||||
ref_id = try(func.reduce.ref_id, "B")
|
||||
expression = try(func.reduce.input, "A")
|
||||
reducer = try(func.reduce.function, "last")
|
||||
mode = try(func.reduce.mode, "strict")
|
||||
} if try(func.reduce, null) != null
|
||||
] : try(rule.value.content.need_reduce, false) ? [{
|
||||
ref_id = "B"
|
||||
expression = "A"
|
||||
# Map 'avg' reducer to 'mean' which is supported by Grafana
|
||||
# Other reducers (last, max, min, sum) are already supported
|
||||
reducer = try(
|
||||
rule.value.content.reducer_type == "avg" ? "mean" : rule.value.content.reducer_type,
|
||||
"last"
|
||||
)
|
||||
mode = "strict"
|
||||
}] : []
|
||||
content {
|
||||
# Use exact ref_id and values from the for_each structure
|
||||
ref_id = data.value.ref_id
|
||||
datasource_uid = "__expr__"
|
||||
model = jsonencode({
|
||||
refId = data.value.ref_id
|
||||
type = "reduce"
|
||||
expression = data.value.expression
|
||||
reducer = data.value.reducer
|
||||
mode = data.value.mode
|
||||
})
|
||||
relative_time_range {
|
||||
from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
|
||||
to = try(rule.value.content.relative_time_range.to, 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Math expressions processing
|
||||
# Node references in Grafana alerts:
|
||||
# - Node A: Initial query (SQL or Prometheus)
|
||||
# - Node B: Reduction operation (created when need_reduce is true, or by a functions[].reduce entry, whose ref_id defaults to B)
|
||||
# - Node M: Math expression (optional, for complex calculations)
|
||||
# - Node T: Final threshold evaluation
|
||||
dynamic "data" {
|
||||
for_each = length(try(rule.value.content.functions, [])) > 0 ? [
|
||||
for idx, func in rule.value.content.functions : {
|
||||
ref_id = try(func.math.ref_id, "M${idx}")
|
||||
expression = func.math.expression
|
||||
} if try(func.math, null) != null
|
||||
] : try(rule.value.content.math_expression, null) != null ? [{
|
||||
ref_id = "M"
|
||||
expression = rule.value.content.math_expression
|
||||
}] : []
|
||||
content {
|
||||
ref_id = data.value.ref_id
|
||||
datasource_uid = "__expr__"
|
||||
model = jsonencode({
|
||||
refId = data.value.ref_id
|
||||
type = "math"
|
||||
expression = data.value.expression
|
||||
input = try(rule.value.content.need_reduce ? "B" : "A", "A")
|
||||
})
|
||||
relative_time_range {
|
||||
from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
|
||||
to = try(rule.value.content.relative_time_range.to, 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Unified threshold evaluation
|
||||
data {
|
||||
ref_id = "T" # Use T consistently for threshold
|
||||
datasource_uid = "__expr__"
|
||||
model = jsonencode({
|
||||
refId = "T"
|
||||
type = "threshold"
|
||||
# Simple alerts: evaluate B (with reduction) or A (without reduction)
|
||||
expression = try(rule.value.content.need_reduce ? "B" : "A", "A")
|
||||
conditions = [
|
||||
{
|
||||
evaluator = merge(
|
||||
{
|
||||
type = rule.value.content.condition_type
|
||||
},
|
||||
# Handle range conditions for site monitoring
|
||||
contains(["outside_range", "within_range"], rule.value.content.condition_type) ? {
|
||||
params = rule.value.content.threshold_range
|
||||
} : {
|
||||
# Handle single threshold for business/system alerts
|
||||
params = [rule.value.content.threshold]
|
||||
}
|
||||
)
|
||||
operator = { type = "and" }
|
||||
# Use 'mean' reducer for consistency (same as mapping 'avg' to 'mean' in reductions)
|
||||
reducer = { type = "mean", params = [] }
|
||||
query = { params = [] }
|
||||
type = "query"
|
||||
}
|
||||
]
|
||||
})
|
||||
relative_time_range {
|
||||
from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
|
||||
to = try(rule.value.content.relative_time_range.to, 0)
|
||||
}
|
||||
}
|
||||
|
||||
# Rule metadata
|
||||
annotations = {
|
||||
summary = rule.value.content.summary
|
||||
description = try(rule.value.content.description, "")
|
||||
threshold = try(
|
||||
contains(["outside_range", "within_range"], try(rule.value.content.condition_type, "gt")) ?
|
||||
tostring(try(rule.value.content.threshold_range[0], "")) :
|
||||
tostring(try(rule.value.content.threshold, ""))
|
||||
)
|
||||
}
|
||||
|
||||
labels = rule.value.content.labels
|
||||
|
||||
no_data_state = try(rule.value.content.no_data_state, var.default_no_data_state)
|
||||
exec_err_state = try(rule.value.content.exec_err_state, var.default_exec_err_state)
|
||||
}
|
||||
}
|
||||
}
|
||||
128
environments/modules/grafana_rule_group_test/variables.tf
Normal file
128
environments/modules/grafana_rule_group_test/variables.tf
Normal file
@ -0,0 +1,128 @@
|
||||
variable "groups" {
|
||||
description = "List of alert groups with their definitions and data sources"
|
||||
type = list(object({
|
||||
dashboard_alert_group_name = string
|
||||
alert_definitions_path = optional(string, null)
|
||||
dashboard_path_if_exist = optional(string, null)
|
||||
keep_manual_changes = optional(bool, false)
|
||||
prevent_destroy_on_recreate = optional(bool, false)
|
||||
alerts_on_datasources_uid = list(string)
|
||||
}))
|
||||
}
|
||||
|
||||
variable "datasources" {
|
||||
description = "List of Grafana data sources"
|
||||
type = list(object({
|
||||
# Main parameters
|
||||
name = string # Data source name (displayed in Grafana)
|
||||
uid = string # Unique source identifier
|
||||
type = string # Data source type (e.g., prometheus, mysql, clickhouse)
|
||||
url = optional(string, null) # Connection URL (for most sources)
|
||||
username = optional(string, null)
|
||||
access_mode = string # Access mode: proxy or direct
|
||||
is_default = bool # Set as default source
|
||||
# Authentication settings
|
||||
basic_auth = optional(bool, false) # Use basic authentication
|
||||
basic_auth_user = optional(string, null) # Username for basic authentication
|
||||
basic_auth_password = optional(string, null) # Password for basic authentication
|
||||
|
||||
# Additional parameters
|
||||
json_data = optional(map(any), {}) # Additional parameters in JSON format
|
||||
secure_json_data = optional(map(string), {}) # Sensitive data in JSON format
|
||||
|
||||
# Terraform lifecycle management fields
|
||||
keep_manual_changes = optional(bool, false) # Ignore manual changes in Grafana
|
||||
prevent_destroy_on_recreate = optional(bool, false) # Prevent resource deletion on update
|
||||
}))
|
||||
}
|
||||
|
||||
variable "org_id" {
|
||||
description = "ID of the Grafana organization"
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "folder_uids" {
|
||||
description = "Mapping of alert group names to their folder UIDs"
|
||||
type = map(string)
|
||||
}
|
||||
|
||||
# Per-folder lookback for the main alert query, keyed by folder UID.
#
# The query's relative_time_range.from is resolved in this order:
#   1. the rule's own relative_time_range.from,
#   2. the entry for the rule's folder UID in the folder time-range map,
#   3. var.default_time_range_from.
# The previous description promised a 60-second folder default, which does not
# match that fallback chain.
# NOTE(review): assumes local.folder_time_ranges is derived from this variable
# without injecting its own defaults - confirm in the module's locals.
variable "folder_time_ranges" {
  description = <<-EOT
    Mapping of folder UIDs to their default time ranges in seconds.
    A rule's own relative_time_range.from takes precedence over the folder value.
    If not specified for a folder, alerts fall back to var.default_time_range_from.
    Example:
    {
      "folder1_uid" = 300   # 5 minutes
      "folder2_uid" = 3600  # 1 hour
    }
  EOT
  type        = map(number)
  default     = {}
}
|
||||
|
||||
# Alert duration and timing configuration
|
||||
variable "interval_seconds" {
|
||||
description = "Interval in seconds for evaluating alerts"
|
||||
type = number
|
||||
default = 60
|
||||
}
|
||||
|
||||
variable "default_interval_ms" {
|
||||
description = "Default interval in milliseconds for evaluating alert expressions"
|
||||
type = number
|
||||
default = 60000
|
||||
}
|
||||
|
||||
variable "default_max_data_points" {
|
||||
description = "Default maximum number of data points"
|
||||
type = number
|
||||
default = 43200
|
||||
}
|
||||
|
||||
# Fallback for a rule's no_data_state when the alert definition does not set one.
# The validation rejects misspelled states at plan time instead of letting the
# Grafana API refuse them during apply.
variable "default_no_data_state" {
  description = "Default no data state for alerts"
  type        = string
  default     = "OK"

  validation {
    condition     = contains(["OK", "NoData", "KeepLast", "Alerting"], var.default_no_data_state)
    error_message = "The default_no_data_state value must be one of: OK, NoData, KeepLast, Alerting."
  }
}
|
||||
|
||||
# Fallback for a rule's exec_err_state when the alert definition does not set one.
# The validation rejects misspelled states at plan time instead of letting the
# Grafana API refuse them during apply.
variable "default_exec_err_state" {
  description = "Default execution error state for alerts"
  type        = string
  default     = "Error"

  validation {
    condition     = contains(["OK", "Error", "KeepLast", "Alerting"], var.default_exec_err_state)
    error_message = "The default_exec_err_state value must be one of: OK, Error, KeepLast, Alerting."
  }
}
|
||||
|
||||
variable "default_alert_duration" {
|
||||
description = "Default duration (in seconds) for how long a condition must be true before alerting"
|
||||
type = number
|
||||
default = 300 # 5 minutes
|
||||
}
|
||||
|
||||
variable "default_evaluation_interval" {
|
||||
description = "Default interval (in seconds) between alert rule evaluations"
|
||||
type = number
|
||||
default = 60 # 1 minute
|
||||
}
|
||||
|
||||
variable "default_time_range_from" {
|
||||
description = "Default time range (in seconds) for main query lookback"
|
||||
type = number
|
||||
default = 604800 # 7 days
|
||||
}
|
||||
|
||||
variable "default_processing_range" {
|
||||
description = "Default time range (in seconds) for processing blocks"
|
||||
type = number
|
||||
default = 600 # 10 minutes
|
||||
}
|
||||
|
||||
# Provenance switch for the managed rule groups.
# The previous description spoke of "provisioning"; the setting is about
# provenance, i.e. whether Grafana treats the rules as owned by Terraform.
# NOTE(review): presumably forwarded to grafana_rule_group.disable_provenance -
# confirm at the resource definition.
variable "disable_provenance" {
  description = "Disables provenance on managed rule groups; when true, the rules can also be modified from sources other than Terraform (e.g. the Grafana UI)"
  type        = bool
  default     = true
}
|
||||
|
||||
variable "alert_variables" {
|
||||
description = "Map of variables to substitute in alert YAML files using templatefile() syntax (e.g., $${variable_name})"
|
||||
type = map(string)
|
||||
default = {}
|
||||
}
|
||||
7
environments/modules/grafana_rule_group_test/versions.tf
Normal file
7
environments/modules/grafana_rule_group_test/versions.tf
Normal file
@ -0,0 +1,7 @@
|
||||
# Toolchain requirements for the grafana_rule_group_test module.
terraform {
  # The module's variables declare object attributes with optional(type, default),
  # which is only available from Terraform 1.3 onwards; fail fast on older CLIs.
  required_version = ">= 1.3.0"

  required_providers {
    # NOTE(review): no provider version constraint is set; consider pinning one
    # once the minimum provider release used by this module is confirmed.
    grafana = {
      source = "grafana/grafana"
    }
  }
}
|
||||
Reference in New Issue
Block a user