# Files
#
# 225 lines
# 9.3 KiB
# Terraform
# Raw Permalink Normal View History

# Grafana alert rule groups.
#
# One rule group is created per entry of local.grouped_alerts_by_datasource
# that has at least one alert file. Every alert file in the entry becomes one
# rule whose data pipeline is assembled from up to four kinds of nodes:
#   - query nodes     (ref_id "A", or one node per key of `queries`)
#   - reduce nodes    (ref_id "B" by default)
#   - math nodes      (ref_id "M" / "M<idx>" by default)
#   - threshold node  (ref_id "T", always emitted)
resource "grafana_rule_group" "alert_groups" {
  # Entries without alert files are skipped, so no empty groups are created.
  for_each = {
    for k, v in local.grouped_alerts_by_datasource :
    k => v if length(v.alert_files) > 0
  }

  # Main parameters
  name               = each.key
  org_id             = var.org_id
  interval_seconds   = var.default_evaluation_interval # Group-wide evaluation interval
  folder_uid         = each.value.folder_uid
  disable_provenance = var.disable_provenance

  # Rules configuration
  dynamic "rule" {
    for_each = each.value.alert_files
    content {
      name = "${rule.value.name} (${each.value.datasource_uid})"

      # Rules with `functions`: the condition is the math ref_id of the LAST
      # function entry (falls back to "T" when that entry has no math.ref_id).
      # Rules without `functions`: the condition is the threshold node "T".
      condition = length(try(rule.value.content.functions, [])) > 0 ? try(rule.value.content.functions[length(rule.value.content.functions) - 1].math.ref_id, "T") : "T"

      # Pending period, always rendered as "<n>s". Resolution order:
      #   1. content.for when it looks like "<digits><s|m|h|d>"
      #   2. content.frequency
      #   3. var.default_alert_duration
      # The outer try() also lands on var.default_alert_duration if any of the
      # steps above raises (e.g. a value that "%d" cannot format).
      for = try(
        coalesce(
          # Try to parse duration string (e.g., "15m", "24h").
          # The multiplier comes from local.duration_units keyed by the unit
          # suffix; an unknown suffix multiplies by 1.
          can(regex("^[0-9]+(s|m|h|d)$", rule.value.content.for)) ? format(
            "%ds",
            tonumber(regex("^([0-9]+)", rule.value.content.for)[0]) *
            lookup(local.duration_units, regex("[smhd]$", rule.value.content.for), 1)
          ) : null,
          # Fallback to frequency or default duration
          format("%ds", try(rule.value.content.frequency, var.default_alert_duration))
        ),
        format("%ds", var.default_alert_duration)
      )

      # Query nodes.
      # When the alert defines `queries`, one node is emitted per entry, keyed
      # by its ref_id. Otherwise a single node "A" is built from `expression`
      # (empty string when `expression` is absent).
      dynamic "data" {
        for_each = can(rule.value.content.queries) ? [
          for ref_id, query in rule.value.content.queries : {
            ref_id = ref_id
            query  = query
          }
        ] : [{ ref_id = "A", query = try(rule.value.content.expression, "") }]
        content {
          ref_id         = data.value.ref_id
          datasource_uid = each.value.datasource_uid
          # Datasource type is looked up by UID; unknown UIDs are treated as "prometheus".
          query_type = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
          model = jsonencode(
            merge(
              # Fields shared by every query node
              {
                refId         = data.value.ref_id
                intervalMs    = var.default_interval_ms
                maxDataPoints = var.default_max_data_points
                instant       = false
                datasource = {
                  type = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
                  uid  = each.value.datasource_uid
                }
              },
              # Datasource-specific fields.
              # Only the "grafana-clickhouse-datasource" type takes the SQL branch;
              # every other type (including the "prometheus" default) takes the
              # expr-based branch.
              contains(["grafana-clickhouse-datasource"],
              lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")) ? {
                # Use time_series format for queries with time grouping, table format for direct aggregation
                format     = can(regex("\\$__timeGroupAlias", data.value.query)) ? "time_series" : null
                formatAs   = can(regex("\\$__timeGroupAlias", data.value.query)) ? null : "table"
                queryType  = "sql"
                rawSql     = data.value.query
                editorMode = "code"
                editorType = "sql"
                } : {
                # Use this node's own query text. In the single-expression case
                # data.value.query already equals content.expression (or ""), and
                # with `queries` each node must carry its own expression rather
                # than all of them sharing content.expression.
                expr      = data.value.query
                format    = "time_series"
                queryType = lookup(local.datasource_mapping_type, each.value.datasource_uid, "prometheus")
              }
            )
          )
          relative_time_range {
            from = try(
              rule.value.content.relative_time_range.from, # First try alert's own config
              lookup(                                      # Then try folder settings
                local.folder_time_ranges,
                each.value.folder_uid,
                var.default_time_range_from # Finally fallback to global default
              )
            )
            to = 0
          }
        }
      }

      # Reduce nodes.
      # With `functions`: one node per function entry that has a `reduce` key
      # (defaults: ref_id "B", input "A", reducer "last", mode "strict").
      # Without `functions`: a single node "B" over "A" when need_reduce is true.
      dynamic "data" {
        for_each = length(try(rule.value.content.functions, [])) > 0 ? [
          for func in rule.value.content.functions : {
            ref_id     = try(func.reduce.ref_id, "B")
            expression = try(func.reduce.input, "A")
            reducer    = try(func.reduce.function, "last")
            mode       = try(func.reduce.mode, "strict")
          } if try(func.reduce, null) != null
          ] : try(rule.value.content.need_reduce, false) ? [{
            ref_id     = "B"
            expression = "A"
            # Map 'avg' reducer to 'mean' which is supported by Grafana.
            # Any other reducer_type is passed through unchanged; a missing
            # reducer_type falls back to "last".
            reducer = try(
              rule.value.content.reducer_type == "avg" ? "mean" : rule.value.content.reducer_type,
              "last"
            )
            mode = "strict"
        }] : []
        content {
          # Use exact ref_id and values from the for_each structure
          ref_id         = data.value.ref_id
          datasource_uid = "__expr__"
          model = jsonencode({
            refId      = data.value.ref_id
            type       = "reduce"
            expression = data.value.expression
            reducer    = data.value.reducer
            mode       = data.value.mode
          })
          relative_time_range {
            from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
            to   = try(rule.value.content.relative_time_range.to, 0)
          }
        }
      }

      # Math nodes.
      # Node references used by the rules built here:
      # - Node A: initial query (or one node per `queries` key)
      # - Node B: reduction (created when need_reduce is true or a function has `reduce`)
      # - Node M / M<idx>: math expression (optional)
      # - Node T: threshold evaluation (always created)
      #
      # With `functions`: one node per function entry that has a `math` key.
      # Without `functions`: a single node "M" when math_expression is set.
      #
      # NOTE(review): for rules without `functions` the rule condition is "T",
      # and T evaluates "A" or "B", so nothing in this resource references the
      # "M" node - confirm whether T is expected to evaluate M instead.
      dynamic "data" {
        for_each = length(try(rule.value.content.functions, [])) > 0 ? [
          for idx, func in rule.value.content.functions : {
            ref_id     = try(func.math.ref_id, "M${idx}")
            expression = func.math.expression
          } if try(func.math, null) != null
          ] : try(rule.value.content.math_expression, null) != null ? [{
            ref_id     = "M"
            expression = rule.value.content.math_expression
        }] : []
        content {
          ref_id         = data.value.ref_id
          datasource_uid = "__expr__"
          model = jsonencode({
            refId      = data.value.ref_id
            type       = "math"
            expression = data.value.expression
            input      = try(rule.value.content.need_reduce ? "B" : "A", "A")
          })
          relative_time_range {
            from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
            to   = try(rule.value.content.relative_time_range.to, 0)
          }
        }
      }

      # Threshold node, emitted for every rule.
      # condition_type and threshold / threshold_range are read without try(),
      # so the selected ones must be present in every alert file.
      data {
        ref_id         = "T" # Use T consistently for threshold
        datasource_uid = "__expr__"
        model = jsonencode({
          refId = "T"
          type  = "threshold"
          # Evaluate B when need_reduce is true, otherwise (or when unset) A
          expression = try(rule.value.content.need_reduce ? "B" : "A", "A")
          conditions = [
            {
              evaluator = merge(
                {
                  type = rule.value.content.condition_type
                },
                # Range conditions take their bounds from threshold_range
                contains(["outside_range", "within_range"], rule.value.content.condition_type) ? {
                  params = rule.value.content.threshold_range
                  } : {
                  # All other condition types use the single threshold value
                  params = [rule.value.content.threshold]
                }
              )
              operator = { type = "and" }
              # Use 'mean' reducer for consistency (same as mapping 'avg' to 'mean' in reductions)
              reducer = { type = "mean", params = [] }
              query   = { params = [] }
              type    = "query"
            }
          ]
        })
        relative_time_range {
          from = try(rule.value.content.relative_time_range.from, var.default_processing_range)
          to   = try(rule.value.content.relative_time_range.to, 0)
        }
      }

      # Rule metadata
      annotations = {
        summary     = rule.value.content.summary
        description = try(rule.value.content.description, "")
        # Range conditions report the first element of threshold_range, all
        # others report threshold. The trailing "" gives the outer try() an
        # actual fallback, so a value tostring() cannot convert yields an
        # empty annotation instead of failing the plan.
        threshold = try(
          contains(["outside_range", "within_range"], try(rule.value.content.condition_type, "gt")) ?
          tostring(try(rule.value.content.threshold_range[0], "")) :
          tostring(try(rule.value.content.threshold, "")),
          ""
        )
      }
      labels         = rule.value.content.labels
      no_data_state  = try(rule.value.content.no_data_state, var.default_no_data_state)
      exec_err_state = try(rule.value.content.exec_err_state, var.default_exec_err_state)
    }
  }
}