225 lines
9.3 KiB
Terraform
225 lines
9.3 KiB
Terraform
|
|
# One Grafana alert rule group per entry in local.grouped_alerts_by_datasource,
# skipping groups that contain no alert files. Each group renders one rule per
# alert file, wiring up to four expression nodes:
#   A  - initial datasource query (SQL or Prometheus)
#   B  - reduce node (when need_reduce / a reduce function is configured)
#   M* - optional math node(s) (ref_id from the function, or "M" / "M${idx}")
#   T  - final threshold node
resource "grafana_rule_group" "alert_groups" {
  for_each = {
    for k, v in local.grouped_alerts_by_datasource :
    k => v if length(v.alert_files) > 0
  }

  # Main parameters
  name               = each.key
  org_id             = var.org_id
  interval_seconds   = var.default_evaluation_interval # Group-wide evaluation interval
  folder_uid         = each.value.folder_uid
  disable_provenance = var.disable_provenance

  # Rules configuration: one rule per alert definition file in this group.
  dynamic "rule" {
    for_each = each.value.alert_files

    content {
      # Datasource UID is appended so the same alert name stays unique across groups.
      name = "${rule.value.name} (${each.value.datasource_uid})"

      # Condition node selection:
      #   function-based alerts -> ref_id of the LAST function's math node
      #     (falls back to "T" if that function has no math.ref_id);
      #   simple alerts -> threshold node "T".
      # NOTE(review): an earlier comment here referred to "math node D"; the code
      # actually uses the last function's math.ref_id — confirm docs elsewhere agree.
      condition = length(try(rule.value.content.functions, [])) > 0 ? try(rule.value.content.functions[length(rule.value.content.functions) - 1].math.ref_id, "T") : "T"

      # Pending period ("for"), normalized to whole seconds:
      #   1. parse duration strings such as "15m" / "24h" via local.duration_units;
      #   2. else fall back to the alert's numeric frequency;
      #   3. else fall back to the global default.
      for = try(
        coalesce(
          # Try to parse duration string (e.g., "15m", "24h")
          can(regex("^[0-9]+(s|m|h|d)$", rule.value.content.for)) ? format(
            "%ds",
            tonumber(regex("^([0-9]+)", rule.value.content.for)[0]) *
            lookup(local.duration_units, regex("[smhd]$", rule.value.content.for), 1)
          ) : null,
          # Fallback to frequency or default duration
          format("%ds", try(rule.value.content.frequency, var.default_alert_duration))
        ),
        format("%ds", var.default_alert_duration)
      )

      # Query data block(s) — node A (or one node per entry in content.queries).
      # Multi-query alerts provide a "queries" map keyed by ref_id; single-query
      # alerts use ref_id "A" with the alert's "expression" field.
      dynamic "data" {
        for_each = can(rule.value.content.queries) ? [
          for ref_id, query in rule.value.content.queries : {
            ref_id = ref_id
            query  = query
          }
        ] : [{ ref_id = "A", query = try(rule.value.content.expression, "") }]

        content {
          ref_id         = data.value.ref_id
          datasource_uid = each.value.datasource_uid
          query_type     = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")

          model = jsonencode(
            merge(
              {
                refId         = data.value.ref_id
                intervalMs    = var.default_interval_ms
                maxDataPoints = var.default_max_data_points
                instant       = false
                datasource = {
                  type = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
                  uid  = each.value.datasource_uid
                }
              },

              # Datasource-specific part of the model: SQL-style settings for
              # ClickHouse, otherwise Prometheus-compatible settings.
              contains(["grafana-clickhouse-datasource"],
              lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")) ? {
                # Use time_series format for queries with time grouping, table format for direct aggregation
                format     = can(regex("\\$__timeGroupAlias", data.value.query)) ? "time_series" : null
                formatAs   = can(regex("\\$__timeGroupAlias", data.value.query)) ? null : "table"
                queryType  = "sql"
                rawSql     = data.value.query
                editorMode = "code"
                editorType = "sql"
              } : {
                # Prometheus-compatible datasources (prometheus, victoriametrics)
                expr      = try(rule.value.content.expression, "")
                format    = "time_series"
                queryType = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
              }
            )
          )

          relative_time_range {
            from = try(
              rule.value.content.relative_time_range.from, # First try alert's own config
              lookup(                                      # Then try folder settings
                local.folder_time_ranges,
                each.value.folder_uid,
                var.default_time_range_from # Finally fallback to global default
              )
            )
            # NOTE(review): hardcoded to "now", unlike the expression blocks below
            # which honor rule.value.content.relative_time_range.to — confirm this
            # asymmetry is intentional.
            to = 0
          }
        }
      }

      # Reduce data block(s) — node B.
      # Function-based alerts: one reduce node per function that defines "reduce".
      # Simple alerts: a single B node when need_reduce is true, reducing A.
      dynamic "data" {
        for_each = length(try(rule.value.content.functions, [])) > 0 ? [
          for func in rule.value.content.functions : {
            ref_id     = try(func.reduce.ref_id, "B")
            expression = try(func.reduce.input, "A")
            reducer    = try(func.reduce.function, "last")
            mode       = try(func.reduce.mode, "strict")
          } if try(func.reduce, null) != null
        ] : try(rule.value.content.need_reduce, false) ? [{
          ref_id     = "B"
          expression = "A"
          # Map 'avg' reducer to 'mean' which is supported by Grafana
          # Other reducers (last, max, min, sum) are already supported
          reducer = try(
            rule.value.content.reducer_type == "avg" ? "mean" : rule.value.content.reducer_type,
            "last"
          )
          mode = "strict"
        }] : []

        content {
          # Use exact ref_id and values from the for_each structure
          ref_id         = data.value.ref_id
          datasource_uid = "__expr__"

          model = jsonencode({
            refId      = data.value.ref_id
            type       = "reduce"
            expression = data.value.expression
            reducer    = data.value.reducer
            mode       = data.value.mode
          })

          relative_time_range {
            from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
            to   = try(rule.value.content.relative_time_range.to, 0)
          }
        }
      }

      # Math data block(s) — node M.
      # Function-based alerts: one math node per function that defines "math"
      # (ref_id defaults to "M${idx}"). Simple alerts: a single "M" node when
      # math_expression is set.
      dynamic "data" {
        for_each = length(try(rule.value.content.functions, [])) > 0 ? [
          for idx, func in rule.value.content.functions : {
            ref_id     = try(func.math.ref_id, "M${idx}")
            expression = func.math.expression
          } if try(func.math, null) != null
        ] : try(rule.value.content.math_expression, null) != null ? [{
          ref_id     = "M"
          expression = rule.value.content.math_expression
        }] : []

        content {
          ref_id         = data.value.ref_id
          datasource_uid = "__expr__"

          model = jsonencode({
            refId      = data.value.ref_id
            type       = "math"
            expression = data.value.expression
            # Feed the math node from B when a reduction exists, otherwise from A.
            input = try(rule.value.content.need_reduce ? "B" : "A", "A")
          })

          relative_time_range {
            from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
            to   = try(rule.value.content.relative_time_range.to, 0)
          }
        }
      }

      # Unified threshold evaluation — node T.
      data {
        ref_id         = "T" # Use T consistently for threshold
        datasource_uid = "__expr__"

        model = jsonencode({
          refId = "T"
          type  = "threshold"
          # Simple alerts: evaluate B (with reduction) or A (without reduction)
          expression = try(rule.value.content.need_reduce ? "B" : "A", "A")
          conditions = [
            {
              evaluator = merge(
                {
                  type = rule.value.content.condition_type
                },
                # Range conditions (outside_range / within_range) take a two-value
                # params list; everything else takes a single threshold value.
                contains(["outside_range", "within_range"], rule.value.content.condition_type) ? {
                  params = rule.value.content.threshold_range
                } : {
                  # Handle single threshold for business/system alerts
                  params = [rule.value.content.threshold]
                }
              )
              operator = { type = "and" }
              # Use 'mean' reducer for consistency (same as mapping 'avg' to 'mean' in reductions)
              reducer = { type = "mean", params = [] }
              query   = { params = [] }
              type    = "query"
            }
          ]
        })

        relative_time_range {
          from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
          to   = try(rule.value.content.relative_time_range.to, 0)
        }
      }

      # Rule metadata. The "threshold" annotation surfaces the lower range bound
      # for range conditions, otherwise the single threshold value (empty string
      # when unset).
      annotations = {
        summary     = rule.value.content.summary
        description = try(rule.value.content.description, "")
        threshold = try(
          contains(["outside_range", "within_range"], try(rule.value.content.condition_type, "gt")) ?
          tostring(try(rule.value.content.threshold_range[0], "")) :
          tostring(try(rule.value.content.threshold, ""))
        )
      }

      labels = rule.value.content.labels

      no_data_state  = try(rule.value.content.no_data_state, var.default_no_data_state)
      exec_err_state = try(rule.value.content.exec_err_state, var.default_exec_err_state)
    }
  }
}
|