release: v1.9.0 — IRM-алерты отдельно от инцидентов
Some checks failed
Deploy / deploy (push) Has been cancelled
CI / test (push) Successful in 37s

- Alembic 005: таблицы irm_alerts и incident_alert_links
- Модуль alerts: API/UI, Ack/Resolve, привязка к инциденту через alert_ids
- Вебхук Grafana: одна транзакция ingress + irm_alerts; разбор payload в grafana_payload
- По умолчанию инцидент из вебхука не создаётся (AUTO_INCIDENT_FROM_ALERT)
- Документация IRM_GRAFANA_PARITY.md, обновления IRM.md и CHANGELOG

Made-with: Cursor
This commit is contained in:
Alexandr
2026-04-03 15:26:38 +03:00
parent 3cb75eb7b7
commit a8ccf1d35c
19 changed files with 722 additions and 60 deletions

View File

@ -1,3 +1,3 @@
"""onGuard24 — модульный монолит (ядро + модули)."""
__version__ = "1.8.0"
__version__ = "1.9.0"

View File

@ -34,6 +34,11 @@ class Settings(BaseSettings):
forgejo_url: str = Field(default="", validation_alias="FORGEJO_URL")
forgejo_token: str = Field(default="", validation_alias="FORGEJO_TOKEN")
log_level: str = Field(default="info", validation_alias="LOG_LEVEL")
# Устаревшее: автосоздание инцидента на каждый вебхук (без учёта irm_alerts). По умолчанию выкл.
auto_incident_from_alert: bool = Field(
default=False,
validation_alias="AUTO_INCIDENT_FROM_ALERT",
)
def get_settings() -> Settings:

View File

@ -9,6 +9,7 @@ from starlette.responses import Response
from onguard24.domain.entities import Alert, Severity
from onguard24.grafana_sources import sources_by_slug, webhook_authorized
from onguard24.ingress.grafana_payload import extract_alert_row_from_grafana_body
logger = logging.getLogger(__name__)
router = APIRouter(tags=["ingress"])
@ -119,19 +120,38 @@ async def _grafana_webhook_impl(
logger.warning("ingress: database not configured, event not persisted")
return Response(status_code=202)
title_row, sev_row, labels_row, fp_row = extract_alert_row_from_grafana_body(body)
async with pool.acquire() as conn:
row = await conn.fetchrow(
"""
INSERT INTO ingress_events (source, body, org_slug, service_name)
VALUES ($1, $2::jsonb, $3, $4)
RETURNING id
""",
"grafana",
json.dumps(body),
stored_org_slug,
service_name,
)
raw_id = row["id"] if row else None
async with conn.transaction():
row = await conn.fetchrow(
"""
INSERT INTO ingress_events (source, body, org_slug, service_name)
VALUES ($1, $2::jsonb, $3, $4)
RETURNING id
""",
"grafana",
json.dumps(body),
stored_org_slug,
service_name,
)
raw_id = row["id"] if row else None
if raw_id is not None:
await conn.execute(
"""
INSERT INTO irm_alerts (
ingress_event_id, status, title, severity, source,
grafana_org_slug, service_name, labels, fingerprint
)
VALUES ($1, 'firing', $2, $3, 'grafana', $4, $5, $6::jsonb, $7)
""",
raw_id,
title_row or "",
sev_row,
stored_org_slug,
service_name,
json.dumps(labels_row),
fp_row,
)
bus = getattr(request.app.state, "event_bus", None)
if bus and raw_id is not None:
title = str(body.get("title") or body.get("ruleName") or "")[:500]

View File

@ -0,0 +1,53 @@
"""Извлечение полей для учёта алерта из тела вебхука Grafana (Unified Alerting)."""
from __future__ import annotations
import json
from typing import Any
def extract_alert_row_from_grafana_body(body: dict[str, Any]) -> tuple[str, str, dict[str, Any], str | None]:
"""
Возвращает: title, severity (info|warning|critical), labels (dict), fingerprint.
"""
title = str(body.get("title") or body.get("ruleName") or "").strip()[:500]
alerts = body.get("alerts")
labels: dict[str, Any] = {}
fingerprint: str | None = None
sev = "warning"
if isinstance(alerts, list) and alerts and isinstance(alerts[0], dict):
a0 = alerts[0]
fp = a0.get("fingerprint")
if fp is not None:
fingerprint = str(fp)[:500]
if isinstance(a0.get("labels"), dict):
labels.update(a0["labels"])
ann = a0.get("annotations")
if isinstance(ann, dict) and not title:
title = str(ann.get("summary") or ann.get("description") or "").strip()[:500]
cl = body.get("commonLabels")
if isinstance(cl, dict):
for k, v in cl.items():
labels.setdefault(k, v)
if not title and isinstance(alerts, list) and alerts and isinstance(alerts[0], dict):
title = str(alerts[0].get("labels", {}).get("alertname") or "").strip()[:500]
raw_s = None
if isinstance(labels.get("severity"), str):
raw_s = labels["severity"].lower()
elif isinstance(labels.get("priority"), str):
raw_s = labels["priority"].lower()
if raw_s in ("critical", "error", "fatal"):
sev = "critical"
elif raw_s in ("warning", "warn"):
sev = "warning"
elif raw_s in ("info", "informational", "none"):
sev = "info"
# JSONB: только JSON-совместимые значения
clean_labels = {str(k): v for k, v in labels.items() if isinstance(v, (str, int, float, bool, type(None)))}
return title, sev, clean_labels, fingerprint

346
onguard24/modules/alerts.py Normal file
View File

@ -0,0 +1,346 @@
"""Учёт входящих алертов (отдельно от инцидентов): firing → acknowledged → resolved."""
from __future__ import annotations
import html
import json
import logging
from uuid import UUID
import asyncpg
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import HTMLResponse
from pydantic import BaseModel, Field
from onguard24.deps import get_pool
from onguard24.domain.events import EventBus
from onguard24.modules.ui_support import wrap_module_html_page
log = logging.getLogger(__name__)
router = APIRouter(tags=["module-alerts"])
ui_router = APIRouter(tags=["web-alerts"], include_in_schema=False)
_VALID_STATUS = frozenset({"firing", "acknowledged", "resolved", "silenced"})
def register_events(_bus: EventBus, _pool: asyncpg.Pool | None = None) -> None:
pass
class AckBody(BaseModel):
by_user: str | None = Field(default=None, max_length=200, description="Кто подтвердил")
class ResolveBody(BaseModel):
by_user: str | None = Field(default=None, max_length=200)
def _row_to_item(r: asyncpg.Record) -> dict:
return {
"id": str(r["id"]),
"ingress_event_id": str(r["ingress_event_id"]),
"status": r["status"],
"title": r["title"],
"severity": r["severity"],
"source": r["source"],
"grafana_org_slug": r["grafana_org_slug"],
"service_name": r["service_name"],
"labels": r["labels"] if isinstance(r["labels"], dict) else {},
"fingerprint": r["fingerprint"],
"acknowledged_at": r["acknowledged_at"].isoformat() if r["acknowledged_at"] else None,
"acknowledged_by": r["acknowledged_by"],
"resolved_at": r["resolved_at"].isoformat() if r["resolved_at"] else None,
"resolved_by": r["resolved_by"],
"created_at": r["created_at"].isoformat() if r["created_at"] else None,
"updated_at": r["updated_at"].isoformat() if r["updated_at"] else None,
}
@router.get("/")
async def list_alerts_api(
pool: asyncpg.Pool | None = Depends(get_pool),
status: str | None = None,
limit: int = 100,
):
if pool is None:
return {"items": [], "database": "disabled"}
limit = min(max(limit, 1), 200)
st = (status or "").strip().lower()
if st and st not in _VALID_STATUS:
raise HTTPException(status_code=400, detail="invalid status filter")
async with pool.acquire() as conn:
if st:
rows = await conn.fetch(
"""
SELECT * FROM irm_alerts WHERE status = $1
ORDER BY created_at DESC LIMIT $2
""",
st,
limit,
)
else:
rows = await conn.fetch(
"""
SELECT * FROM irm_alerts
ORDER BY created_at DESC LIMIT $1
""",
limit,
)
return {"items": [_row_to_item(r) for r in rows]}
@router.get("/{alert_id}")
async def get_alert_api(alert_id: UUID, pool: asyncpg.Pool | None = Depends(get_pool)):
if pool is None:
raise HTTPException(status_code=503, detail="database disabled")
async with pool.acquire() as conn:
row = await conn.fetchrow("SELECT * FROM irm_alerts WHERE id = $1::uuid", alert_id)
raw = None
if row and row.get("ingress_event_id"):
raw = await conn.fetchrow(
"SELECT id, body, received_at FROM ingress_events WHERE id = $1::uuid",
row["ingress_event_id"],
)
if not row:
raise HTTPException(status_code=404, detail="not found")
out = _row_to_item(row)
if raw:
out["raw_received_at"] = raw["received_at"].isoformat() if raw["received_at"] else None
body = raw["body"]
out["raw_body"] = dict(body) if hasattr(body, "keys") else body
else:
out["raw_received_at"] = None
out["raw_body"] = None
return out
@router.patch("/{alert_id}/acknowledge", status_code=200)
async def acknowledge_alert_api(
alert_id: UUID,
body: AckBody,
pool: asyncpg.Pool | None = Depends(get_pool),
):
if pool is None:
raise HTTPException(status_code=503, detail="database disabled")
who = (body.by_user or "").strip() or None
async with pool.acquire() as conn:
row = await conn.fetchrow(
"""
UPDATE irm_alerts SET
status = 'acknowledged',
acknowledged_at = now(),
acknowledged_by = COALESCE($2, acknowledged_by),
updated_at = now()
WHERE id = $1::uuid AND status = 'firing'
RETURNING *
""",
alert_id,
who,
)
if not row:
raise HTTPException(status_code=409, detail="alert not in firing state or not found")
return _row_to_item(row)
@router.patch("/{alert_id}/resolve", status_code=200)
async def resolve_alert_api(
alert_id: UUID,
body: ResolveBody,
pool: asyncpg.Pool | None = Depends(get_pool),
):
if pool is None:
raise HTTPException(status_code=503, detail="database disabled")
who = (body.by_user or "").strip() or None
async with pool.acquire() as conn:
row = await conn.fetchrow(
"""
UPDATE irm_alerts SET
status = 'resolved',
resolved_at = now(),
resolved_by = COALESCE($2, resolved_by),
updated_at = now()
WHERE id = $1::uuid AND status IN ('firing', 'acknowledged')
RETURNING *
""",
alert_id,
who,
)
if not row:
raise HTTPException(
status_code=409,
detail="alert cannot be resolved from current state or not found",
)
return _row_to_item(row)
_SYNC_BTN_STYLE = """
<script>
function ogAck(aid){fetch('/api/v1/modules/alerts/'+aid+'/acknowledge',{method:'PATCH',headers:{'Content-Type':'application/json'},body:JSON.stringify({})}).then(r=>{if(r.ok)location.reload();else r.text().then(t=>alert('Ошибка '+r.status+': '+t.slice(0,200)));});}
function ogRes(aid){fetch('/api/v1/modules/alerts/'+aid+'/resolve',{method:'PATCH',headers:{'Content-Type':'application/json'},body:JSON.stringify({})}).then(r=>{if(r.ok)location.reload();else r.text().then(t=>alert('Ошибка '+r.status+': '+t.slice(0,200)));});}
function ogInc(aid,title){var t=prompt('Заголовок инцидента',title||'');if(t===null)return;fetch('/api/v1/modules/incidents/',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({title:t,alert_ids:[aid]})}).then(r=>{if(r.ok)r.json().then(j=>location.href='/ui/modules/incidents/'+j.id);else r.text().then(x=>alert('Ошибка '+r.status+': '+x.slice(0,200)));});}
</script>
"""
@ui_router.get("/", response_class=HTMLResponse)
async def alerts_ui_list(request: Request):
pool = get_pool(request)
body = ""
if pool is None:
body = "<p>База не настроена.</p>"
else:
try:
async with pool.acquire() as conn:
rows = await conn.fetch(
"""
SELECT id, status, title, severity, grafana_org_slug, service_name, created_at, fingerprint
FROM irm_alerts
ORDER BY created_at DESC
LIMIT 150
"""
)
if not rows:
body = "<p>Пока нет алертов. События появляются после вебхука Grafana.</p>"
else:
trs = []
for r in rows:
aid = str(r["id"])
trs.append(
"<tr>"
f"<td>{html.escape(r['status'])}</td>"
f"<td><a href=\"/ui/modules/alerts/{html.escape(aid, quote=True)}\">"
f"{html.escape(aid[:8])}…</a></td>"
f"<td>{html.escape((r['title'] or '')[:200])}</td>"
f"<td>{html.escape(r['severity'])}</td>"
f"<td>{html.escape(str(r['grafana_org_slug'] or ''))}</td>"
f"<td>{html.escape(str(r['service_name'] or ''))}</td>"
f"<td>{html.escape(r['created_at'].isoformat() if r['created_at'] else '')}</td>"
"</tr>"
)
body = (
"<p class='gc-muted'>Алерт — запись о входящем уведомлении. "
"<strong>Инцидент</strong> создаётся вручную (из карточки алерта или раздела «Инциденты») "
"и может ссылаться на один или несколько алертов.</p>"
"<table class='irm-table'><thead><tr><th>Статус</th><th>ID</th><th>Заголовок</th>"
"<th>Важность</th><th>Grafana slug</th><th>Сервис</th><th>Создан</th></tr></thead><tbody>"
+ "".join(trs)
+ "</tbody></table>"
)
except Exception as e:
body = f"<p class='module-err'>{html.escape(str(e))}</p>"
page = f"<h1>Алерты</h1>{body}{_SYNC_BTN_STYLE}"
return HTMLResponse(
wrap_module_html_page(
document_title="Алерты — onGuard24",
current_slug="alerts",
main_inner_html=page,
)
)
@ui_router.get("/{alert_id:uuid}", response_class=HTMLResponse)
async def alerts_ui_detail(request: Request, alert_id: UUID):
pool = get_pool(request)
if pool is None:
return HTMLResponse(
wrap_module_html_page(
document_title="Алерт — onGuard24",
current_slug="alerts",
main_inner_html="<h1>Алерт</h1><p>База не настроена.</p>",
)
)
try:
async with pool.acquire() as conn:
row = await conn.fetchrow("SELECT * FROM irm_alerts WHERE id = $1::uuid", alert_id)
raw = None
if row and row.get("ingress_event_id"):
raw = await conn.fetchrow(
"SELECT body, received_at FROM ingress_events WHERE id = $1::uuid",
row["ingress_event_id"],
)
except Exception as e:
return HTMLResponse(
wrap_module_html_page(
document_title="Алерт — onGuard24",
current_slug="alerts",
main_inner_html=f"<h1>Алерт</h1><p class='module-err'>{html.escape(str(e))}</p>",
)
)
if not row:
inner = "<p>Не найдено.</p>"
else:
aid = str(row["id"])
st = row["status"]
title_js = json.dumps(row["title"] or "")
btns = []
if st == "firing":
btns.append(
f"<button type='button' class='og-btn og-btn-primary' "
f"onclick=\"ogAck('{html.escape(aid, quote=True)}')\">Подтвердить (Ack)</button>"
)
if st in ("firing", "acknowledged"):
btns.append(
f"<button type='button' class='og-btn' "
f"onclick=\"ogRes('{html.escape(aid, quote=True)}')\">Resolve</button>"
)
btns.append(
f"<button type='button' class='og-btn' "
f"onclick=\"ogInc('{html.escape(aid, quote=True)}',{title_js})\">"
"Создать инцидент</button>"
)
lab = row["labels"]
lab_s = json.dumps(dict(lab), ensure_ascii=False, indent=2) if isinstance(lab, dict) else "{}"
raw_pre = ""
if raw:
b = raw["body"]
pretty = json.dumps(dict(b), ensure_ascii=False, indent=2) if hasattr(b, "keys") else str(b)
if len(pretty) > 14000:
pretty = pretty[:14000] + "\n"
raw_pre = (
"<h2 style='font-size:1.05rem;margin-top:1rem'>Полное тело вебхука</h2>"
f"<pre style='overflow:auto;max-height:26rem;font-size:0.78rem;"
f"background:#18181b;color:#e4e4e7;padding:0.75rem;border-radius:8px'>"
f"{html.escape(pretty)}</pre>"
)
inner = (
f"<p><a href=\"/ui/modules/alerts/\">← К списку алертов</a></p>"
f"<h1>Алерт</h1><div class='og-sync-bar'>{''.join(btns)}</div>"
f"<dl style='display:grid;grid-template-columns:11rem 1fr;gap:0.35rem 1rem;font-size:0.9rem'>"
f"<dt>ID</dt><dd><code>{html.escape(aid)}</code></dd>"
f"<dt>Статус</dt><dd>{html.escape(st)}</dd>"
f"<dt>Заголовок</dt><dd>{html.escape(row['title'] or '')}</dd>"
f"<dt>Важность</dt><dd>{html.escape(row['severity'])}</dd>"
f"<dt>Grafana slug</dt><dd>{html.escape(str(row['grafana_org_slug'] or ''))}</dd>"
f"<dt>Сервис</dt><dd>{html.escape(str(row['service_name'] or ''))}</dd>"
f"<dt>Fingerprint</dt><dd><code>{html.escape(str(row['fingerprint'] or ''))}</code></dd>"
f"<dt>Labels</dt><dd><pre style='margin:0;font-size:0.8rem'>{html.escape(lab_s)}</pre></dd>"
f"</dl>{raw_pre}"
)
page = f"{inner}{_SYNC_BTN_STYLE}"
return HTMLResponse(
wrap_module_html_page(
document_title="Алерт — onGuard24",
current_slug="alerts",
main_inner_html=page,
)
)
async def render_home_fragment(request: Request) -> str:
pool = get_pool(request)
if pool is None:
return '<p class="module-note">Нужна БД для учёта алертов.</p>'
try:
async with pool.acquire() as conn:
n = await conn.fetchval("SELECT count(*)::int FROM irm_alerts")
nf = await conn.fetchval(
"SELECT count(*)::int FROM irm_alerts WHERE status = 'firing'"
)
except Exception:
return '<p class="module-note">Таблица алертов недоступна (миграция 005?).</p>'
return (
f'<div class="module-fragment"><p>Алертов в учёте: <strong>{int(n)}</strong> '
f'(<strong>{int(nf)}</strong> firing). '
f'<a href="/ui/modules/alerts/">Открыть</a></p></div>'
)

View File

@ -12,6 +12,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import HTMLResponse
from pydantic import BaseModel, Field
from onguard24.config import get_settings
from onguard24.deps import get_pool
from onguard24.domain.events import AlertReceived, DomainEvent, EventBus
from onguard24.modules.ui_support import wrap_module_html_page
@ -26,6 +27,7 @@ class IncidentCreate(BaseModel):
title: str = Field(..., min_length=1, max_length=500)
status: str = Field(default="open", max_length=64)
severity: str = Field(default="warning", max_length=32)
alert_ids: list[UUID] = Field(default_factory=list, description="Привязка к irm_alerts")
class IncidentPatch(BaseModel):
@ -39,6 +41,8 @@ def register_events(bus: EventBus, pool: asyncpg.Pool | None = None) -> None:
return
async def on_alert(ev: DomainEvent) -> None:
if not get_settings().auto_incident_from_alert:
return
if not isinstance(ev, AlertReceived) or ev.raw_payload_ref is None:
return
a = ev.alert
@ -136,17 +140,29 @@ async def create_incident_api(
if pool is None:
raise HTTPException(status_code=503, detail="database disabled")
async with pool.acquire() as conn:
row = await conn.fetchrow(
"""
INSERT INTO incidents (title, status, severity, source, grafana_org_slug, service_name)
VALUES ($1, $2, $3, 'manual', NULL, NULL)
RETURNING id, title, status, severity, source, ingress_event_id, created_at, updated_at,
grafana_org_slug, service_name
""",
body.title.strip(),
body.status,
body.severity,
)
async with conn.transaction():
row = await conn.fetchrow(
"""
INSERT INTO incidents (title, status, severity, source, grafana_org_slug, service_name)
VALUES ($1, $2, $3, 'manual', NULL, NULL)
RETURNING id, title, status, severity, source, ingress_event_id, created_at, updated_at,
grafana_org_slug, service_name
""",
body.title.strip(),
body.status,
body.severity,
)
iid = row["id"]
for aid in body.alert_ids[:50]:
await conn.execute(
"""
INSERT INTO incident_alert_links (incident_id, alert_id)
VALUES ($1::uuid, $2::uuid)
ON CONFLICT DO NOTHING
""",
iid,
aid,
)
return {
"id": str(row["id"]),
"title": row["title"],
@ -312,7 +328,7 @@ async def incidents_ui_home(request: Request):
<thead><tr><th>ID</th><th>Заголовок</th><th>Статус</th><th>Важность</th><th>Источник</th><th>Grafana slug</th><th>Сервис</th><th>Создан</th></tr></thead>
<tbody>{rows_html or '<tr><td colspan="8">Пока нет записей</td></tr>'}</tbody>
</table>
<p><small>Создание из Grafana: webhook → <code>ingress_events</code> → событие → строка здесь. Пустой заголовок бывает при тестовом JSON без полей алерта.</small></p>"""
<p><small>Сначала вебхук создаёт <a href="/ui/modules/alerts/">алерт</a> (учёт, Ack/Resolve). Инцидент — отдельная сущность: создаётся вручную или из карточки алерта, к нему можно привязать один или несколько алертов. Пустой заголовок в списке — часто тестовый JSON без полей правила.</small></p>"""
return HTMLResponse(
wrap_module_html_page(
document_title="Инциденты — onGuard24",

View File

@ -14,6 +14,7 @@ from starlette.requests import Request
from onguard24.domain.events import EventBus
from onguard24.modules import (
alerts,
contacts,
escalations,
grafana_catalog,
@ -52,6 +53,15 @@ def _mounts() -> list[ModuleMount]:
ui_router=grafana_catalog.ui_router,
render_home_fragment=grafana_catalog.render_home_fragment,
),
ModuleMount(
router=alerts.router,
url_prefix="/api/v1/modules/alerts",
register_events=alerts.register_events,
slug="alerts",
title="Алерты",
ui_router=alerts.ui_router,
render_home_fragment=alerts.render_home_fragment,
),
ModuleMount(
router=incidents.router,
url_prefix="/api/v1/modules/incidents",