From a8ccf1d35c40475f270f939fba28bfddf7a1ee6f Mon Sep 17 00:00:00 2001 From: Alexandr Date: Fri, 3 Apr 2026 15:26:38 +0300 Subject: [PATCH] =?UTF-8?q?release:=20v1.9.0=20=E2=80=94=20IRM-=D0=B0?= =?UTF-8?q?=D0=BB=D0=B5=D1=80=D1=82=D1=8B=20=D0=BE=D1=82=D0=B4=D0=B5=D0=BB?= =?UTF-8?q?=D1=8C=D0=BD=D0=BE=20=D0=BE=D1=82=20=D0=B8=D0=BD=D1=86=D0=B8?= =?UTF-8?q?=D0=B4=D0=B5=D0=BD=D1=82=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Alembic 005: таблицы irm_alerts и incident_alert_links - Модуль alerts: API/UI, Ack/Resolve, привязка к инциденту через alert_ids - Вебхук Grafana: одна транзакция ingress + irm_alerts; разбор payload в grafana_payload - По умолчанию инцидент из вебхука не создаётся (AUTO_INCIDENT_FROM_ALERT) - Документация IRM_GRAFANA_PARITY.md, обновления IRM.md и CHANGELOG Made-with: Cursor --- .env.example | 3 + CHANGELOG.md | 17 ++ alembic/versions/005_irm_alerts.py | 74 ++++++ docs/IRM.md | 12 +- docs/IRM_GRAFANA_PARITY.md | 39 +++ onguard24/__init__.py | 2 +- onguard24/config.py | 5 + onguard24/ingress/grafana.py | 44 +++- onguard24/ingress/grafana_payload.py | 53 ++++ onguard24/modules/alerts.py | 346 +++++++++++++++++++++++++++ onguard24/modules/incidents.py | 40 +++- onguard24/modules/registry.py | 10 + pyproject.toml | 2 +- tests/irm_db_fake.py | 13 + tests/test_alerts_api.py | 9 + tests/test_grafana_payload.py | 26 ++ tests/test_ingress.py | 48 ++-- tests/test_irm_modules.py | 37 ++- tests/test_root_ui.py | 2 + 19 files changed, 722 insertions(+), 60 deletions(-) create mode 100644 alembic/versions/005_irm_alerts.py create mode 100644 docs/IRM_GRAFANA_PARITY.md create mode 100644 onguard24/ingress/grafana_payload.py create mode 100644 onguard24/modules/alerts.py create mode 100644 tests/test_alerts_api.py create mode 100644 tests/test_grafana_payload.py diff --git a/.env.example b/.env.example index 533fef5..c83e615 100644 --- a/.env.example +++ b/.env.example @@ -6,6 +6,9 @@ LOG_LEVEL=info # Опционально: общий секрет для вебхуков (если у источника в JSON не задан свой webhook_secret) # GRAFANA_WEBHOOK_SECRET= +# Устаревшее: автосоздание инцидента на каждый вебхук (дублирует irm_alerts). Обычно не нужно. +# AUTO_INCIDENT_FROM_ALERT=1 + # Несколько Grafana: JSON-массив. slug — часть URL вебхука: /api/v1/ingress/grafana/{slug} # Пример: [{"slug":"adibrov","api_url":"https://grafana-adibrov.example","api_token":"glsa_...","webhook_secret":"длинный-секрет"}] # Если пусто, но задан GRAFANA_URL — один источник со slug "default" (вебхук /api/v1/ingress/grafana/default) diff --git a/CHANGELOG.md b/CHANGELOG.md index 836ba60..54398f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,23 @@ Формат: семантическое версионирование `MAJOR.MINOR.PATCH`. Git-теги `v1.0.0`, `v1.1.0` и т.д. — см. [docs/VERSIONING.md](docs/VERSIONING.md). +## [1.9.0] — 2026-04-03 + +Алерты отдельно от инцидентов (модель ближе к Grafana IRM). + +### Добавлено + +- **Alembic `005_irm_alerts`:** таблицы `irm_alerts`, `incident_alert_links`. +- **Модуль «Алерты»:** API и UI, статусы firing → acknowledged → resolved, полный JSON вебхука, кнопка «Создать инцидент». +- **Вебхук Grafana:** в одной транзакции `ingress_events` + `irm_alerts`. +- **`extract_alert_row_from_grafana_body`** — заголовок, severity, labels, fingerprint. +- **Документация:** [docs/IRM_GRAFANA_PARITY.md](docs/IRM_GRAFANA_PARITY.md). + +### Изменено + +- **Инцидент из вебхука по умолчанию не создаётся**; включение старого поведения: `AUTO_INCIDENT_FROM_ALERT=1`. +- **POST /incidents:** опционально `alert_ids` для привязки к `irm_alerts`. + ## [1.8.0] — 2026-04-03 UI каталога Grafana и инцидентов; правки CI/CD деплоя. diff --git a/alembic/versions/005_irm_alerts.py b/alembic/versions/005_irm_alerts.py new file mode 100644 index 0000000..4f153dc --- /dev/null +++ b/alembic/versions/005_irm_alerts.py @@ -0,0 +1,74 @@ +"""IRM: алерты отдельно от инцидентов (ack/resolve), связь N:M инцидент↔алерт + +Revision ID: 005_irm_alerts +Revises: 004_grafana_catalog +Create Date: 2026-04-03 + +""" + +from typing import Sequence, Union + +from alembic import op + +revision: str = "005_irm_alerts" +down_revision: Union[str, None] = "004_grafana_catalog" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute( + """ + CREATE TABLE IF NOT EXISTS irm_alerts ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + ingress_event_id uuid NOT NULL UNIQUE REFERENCES ingress_events(id) ON DELETE CASCADE, + status text NOT NULL DEFAULT 'firing' + CHECK (status IN ('firing', 'acknowledged', 'resolved', 'silenced')), + title text NOT NULL DEFAULT '', + severity text NOT NULL DEFAULT 'warning', + source text NOT NULL DEFAULT 'grafana', + grafana_org_slug text, + service_name text, + labels jsonb NOT NULL DEFAULT '{}'::jsonb, + fingerprint text, + acknowledged_at timestamptz, + acknowledged_by text, + resolved_at timestamptz, + resolved_by text, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() + ); + """ + ) + op.execute( + """ + CREATE INDEX IF NOT EXISTS irm_alerts_status_created_idx + ON irm_alerts (status, created_at DESC); + """ + ) + op.execute( + """ + CREATE INDEX IF NOT EXISTS irm_alerts_ingress_event_id_idx + ON irm_alerts (ingress_event_id); + """ + ) + op.execute( + """ + CREATE TABLE IF NOT EXISTS incident_alert_links ( + incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE, + alert_id uuid NOT NULL REFERENCES irm_alerts(id) ON DELETE CASCADE, + PRIMARY KEY (incident_id, alert_id) + ); + """ + ) + op.execute( + """ + CREATE INDEX IF NOT EXISTS incident_alert_links_alert_idx + ON incident_alert_links (alert_id); + """ + ) + + +def downgrade() -> None: + op.execute("DROP TABLE IF EXISTS incident_alert_links;") + op.execute("DROP TABLE IF EXISTS irm_alerts;") diff --git a/docs/IRM.md b/docs/IRM.md index 3239175..35d2db3 100644 --- a/docs/IRM.md +++ b/docs/IRM.md @@ -6,7 +6,8 @@ | Область | Назначение | onGuard24 | Grafana / внешнее | |---------|------------|-----------|-------------------| -| **Инциденты** | Учёт сбоев, статусы (open → resolved), связь с алертом | Модуль `incidents`: таблица `incidents`, API, UI, авто-создание из `alert.received` | Contact point **Webhook** → `POST /api/v1/ingress/grafana`; правила алертинга в Grafana | +| **Инциденты** | Учёт сбоев, статусы (open → resolved), связь с алертами | Модуль `incidents`: `incidents`, `incident_alert_links`, API, UI; создание вручную или с `alert_ids` | См. **Алерты**; [IRM_GRAFANA_PARITY.md](IRM_GRAFANA_PARITY.md) | +| **Алерты (IRM)** | Приём, Ack/Resolve, не смешивать с инцидентом | Модуль `alerts`: `irm_alerts`, UI/API, вебхук пишет в одной транзакции с `ingress_events` | Grafana IRM Alert Groups; у нас без группировки/эскалации на уровне алерта | | **Задачи** | Подзадачи по инциденту (разбор, фикс) | Модуль `tasks`: таблица `tasks`, привязка к `incident_id` | Опционально: ссылки из алерта; основная работа в onGuard24 | | **Цепочки эскалаций** | Кого звать и в каком порядке при таймаутах | Модуль `escalations`: таблица `escalation_policies` (JSON `steps`), API/UI заготовка | Маршрутизация уведомлений может дублироваться в Grafana contact points; целевая логика — в onGuard24 | | **Календарь дежурств** | Кто в смене, расписание | Модуль `schedules` (развитие) | Календари/команды — данные в onGuard24; уведомления — через интеграции | @@ -17,11 +18,12 @@ | **Пользователи / права** | RBAC | *Пока нет* | SSO Grafana, сеть за reverse proxy | | **SLO** | Цели по доступности | *Вне скоупа v1* | Grafana SLO / Mimir | -## Поток данных (алерт → инцидент) +## Поток данных (как в Grafana IRM) -1. Grafana срабатывает правило → шлёт JSON на **webhook** onGuard24. -2. Сервис пишет строку в `ingress_events`, публикует **`alert.received`**. -3. Модуль **incidents** подписан на событие и создаёт запись в **`incidents`** с ссылкой на `ingress_event_id`. +1. Grafana срабатывает правило → JSON на **webhook** onGuard24. +2. В одной транзакции: **`ingress_events`** + **`irm_alerts`** (статус `firing`), публикуется **`alert.received`**. +3. Дежурный в модуле **Алерты** читает заголовок, лейблы, **Acknowledge** / **Resolve** — это не создание инцидента. +4. **Инцидент** создаётся отдельно (вручную или из карточки алерта), опционально с привязкой **`alert_ids`**. Авто-инцидент из вебхука только при **`AUTO_INCIDENT_FROM_ALERT=1`** (legacy). ## Что настроить в Grafana (обязательно для приёма алертов) diff --git a/docs/IRM_GRAFANA_PARITY.md b/docs/IRM_GRAFANA_PARITY.md new file mode 100644 index 0000000..4c631d1 --- /dev/null +++ b/docs/IRM_GRAFANA_PARITY.md @@ -0,0 +1,39 @@ +# Сравнение onGuard24 с Grafana IRM (Alerting / Incident) + +Grafana Cloud / IRM даёт **группы алертов**, **Acknowledge / Resolve**, **инциденты**, **команды (teams)**, **эскалационные цепочки**, **расписания дежурств**. Ниже — что уже есть в onGuard24 и что планировать отдельно. + +## Уже есть (после разделения алерт / инцидент) + +| Grafana IRM (идея) | onGuard24 | +|--------------------|-----------| +| Входящие уведомления от интеграции | Webhook `POST /api/v1/ingress/grafana` → `ingress_events` + **`irm_alerts`** | +| Статусы firing / acknowledged / resolved | Поле **`irm_alerts.status`**, UI **Алерты**, API `PATCH …/acknowledge`, `…/resolve` | +| Просмотр labels / сырого payload | Карточка алерта в UI, JSON вебхука | +| Инцидент как отдельная сущность | **`incidents`**, создание вручную или кнопка «Создать инцидент» на алерте; связь **`incident_alert_links`** | +| Эскалации (JSON-шаги) | Модуль **Эскалации** (`escalation_policies`) — без автодвижка по таймерам | +| Контакты / каналы | Модуль **Контакты** | +| Расписания (заглушка) | **Календарь дежурств** — UI-задел | + +## Пока нет (зрелые следующие этапы) + +| Функция Grafana IRM | Заметка | +|---------------------|---------| +| **Teams** с фильтрами и привязкой маршрутов | Нет сущности `team`; алерты не маршрутизируются по команде | +| **Alert groups** (несколько алертов в одной группе с общим ID) | Сейчас **одна строка `irm_alerts` на один webhook**; группировка fingerprint / rule_uid — отдельная задача | +| **Silence / Restart** из UI | Статус `silenced` в БД зарезервирован, логика не подключена | +| **Эскалация по таймеру** (wait 15m → notify next) | Политики есть, **фонового исполнителя** нет | +| **On-call из расписания** в цепочке | Нет связи schedules → escalation executor | +| **Пользователи / «Mine» / назначение** | Нет учётных записей onGuard24 для дежурного; `acknowledged_by` — свободный текст (можно расширить) | +| **Интеграция обратно в Grafana** (resolve в Grafana из IRM) | Не делалось | + +## Переменные окружения + +- **`AUTO_INCIDENT_FROM_ALERT`** — если `1` / `true`, сохраняется старое поведение: **каждый** вебхук ещё и создаёт строку в **`incidents`**. По умолчанию **выключено**: только **`irm_alerts`**. + +## Рекомендуемый поток + +1. Grafana → webhook → **алерт** (`firing`). +2. Дежурный в **Алертах**: прочитал → **Ack** → разобрался → **Resolve** (или сразу Resolve). +3. При необходимости **Создать инцидент** (документирование, задачи, эскалация вручную). + +Так модель ближе к IRM, где **алерт** и **инцидент** разведены. diff --git a/onguard24/__init__.py b/onguard24/__init__.py index 552880b..0c9e4d3 100644 --- a/onguard24/__init__.py +++ b/onguard24/__init__.py @@ -1,3 +1,3 @@ """onGuard24 — модульный монолит (ядро + модули).""" -__version__ = "1.8.0" +__version__ = "1.9.0" diff --git a/onguard24/config.py b/onguard24/config.py index 7c37853..1c225dc 100644 --- a/onguard24/config.py +++ b/onguard24/config.py @@ -34,6 +34,11 @@ class Settings(BaseSettings): forgejo_url: str = Field(default="", validation_alias="FORGEJO_URL") forgejo_token: str = Field(default="", validation_alias="FORGEJO_TOKEN") log_level: str = Field(default="info", validation_alias="LOG_LEVEL") + # Устаревшее: автосоздание инцидента на каждый вебхук (без учёта irm_alerts). По умолчанию выкл. + auto_incident_from_alert: bool = Field( + default=False, + validation_alias="AUTO_INCIDENT_FROM_ALERT", + ) def get_settings() -> Settings: diff --git a/onguard24/ingress/grafana.py b/onguard24/ingress/grafana.py index a001dad..5c69b86 100644 --- a/onguard24/ingress/grafana.py +++ b/onguard24/ingress/grafana.py @@ -9,6 +9,7 @@ from starlette.responses import Response from onguard24.domain.entities import Alert, Severity from onguard24.grafana_sources import sources_by_slug, webhook_authorized +from onguard24.ingress.grafana_payload import extract_alert_row_from_grafana_body logger = logging.getLogger(__name__) router = APIRouter(tags=["ingress"]) @@ -119,19 +120,38 @@ async def _grafana_webhook_impl( logger.warning("ingress: database not configured, event not persisted") return Response(status_code=202) + title_row, sev_row, labels_row, fp_row = extract_alert_row_from_grafana_body(body) async with pool.acquire() as conn: - row = await conn.fetchrow( - """ - INSERT INTO ingress_events (source, body, org_slug, service_name) - VALUES ($1, $2::jsonb, $3, $4) - RETURNING id - """, - "grafana", - json.dumps(body), - stored_org_slug, - service_name, - ) - raw_id = row["id"] if row else None + async with conn.transaction(): + row = await conn.fetchrow( + """ + INSERT INTO ingress_events (source, body, org_slug, service_name) + VALUES ($1, $2::jsonb, $3, $4) + RETURNING id + """, + "grafana", + json.dumps(body), + stored_org_slug, + service_name, + ) + raw_id = row["id"] if row else None + if raw_id is not None: + await conn.execute( + """ + INSERT INTO irm_alerts ( + ingress_event_id, status, title, severity, source, + grafana_org_slug, service_name, labels, fingerprint + ) + VALUES ($1, 'firing', $2, $3, 'grafana', $4, $5, $6::jsonb, $7) + """, + raw_id, + title_row or "—", + sev_row, + stored_org_slug, + service_name, + json.dumps(labels_row), + fp_row, + ) bus = getattr(request.app.state, "event_bus", None) if bus and raw_id is not None: title = str(body.get("title") or body.get("ruleName") or "")[:500] diff --git a/onguard24/ingress/grafana_payload.py b/onguard24/ingress/grafana_payload.py new file mode 100644 index 0000000..a5fc106 --- /dev/null +++ b/onguard24/ingress/grafana_payload.py @@ -0,0 +1,53 @@ +"""Извлечение полей для учёта алерта из тела вебхука Grafana (Unified Alerting).""" + +from __future__ import annotations + +import json +from typing import Any + + +def extract_alert_row_from_grafana_body(body: dict[str, Any]) -> tuple[str, str, dict[str, Any], str | None]: + """ + Возвращает: title, severity (info|warning|critical), labels (dict), fingerprint. + """ + title = str(body.get("title") or body.get("ruleName") or "").strip()[:500] + alerts = body.get("alerts") + labels: dict[str, Any] = {} + fingerprint: str | None = None + sev = "warning" + + if isinstance(alerts, list) and alerts and isinstance(alerts[0], dict): + a0 = alerts[0] + fp = a0.get("fingerprint") + if fp is not None: + fingerprint = str(fp)[:500] + if isinstance(a0.get("labels"), dict): + labels.update(a0["labels"]) + ann = a0.get("annotations") + if isinstance(ann, dict) and not title: + title = str(ann.get("summary") or ann.get("description") or "").strip()[:500] + + cl = body.get("commonLabels") + if isinstance(cl, dict): + for k, v in cl.items(): + labels.setdefault(k, v) + + if not title and isinstance(alerts, list) and alerts and isinstance(alerts[0], dict): + title = str(alerts[0].get("labels", {}).get("alertname") or "").strip()[:500] + + raw_s = None + if isinstance(labels.get("severity"), str): + raw_s = labels["severity"].lower() + elif isinstance(labels.get("priority"), str): + raw_s = labels["priority"].lower() + if raw_s in ("critical", "error", "fatal"): + sev = "critical" + elif raw_s in ("warning", "warn"): + sev = "warning" + elif raw_s in ("info", "informational", "none"): + sev = "info" + + # JSONB: только JSON-совместимые значения + clean_labels = {str(k): v for k, v in labels.items() if isinstance(v, (str, int, float, bool, type(None)))} + + return title, sev, clean_labels, fingerprint diff --git a/onguard24/modules/alerts.py b/onguard24/modules/alerts.py new file mode 100644 index 0000000..bf17afe --- /dev/null +++ b/onguard24/modules/alerts.py @@ -0,0 +1,346 @@ +"""Учёт входящих алертов (отдельно от инцидентов): firing → acknowledged → resolved.""" + +from __future__ import annotations + +import html +import json +import logging +from uuid import UUID + +import asyncpg +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import HTMLResponse +from pydantic import BaseModel, Field + +from onguard24.deps import get_pool +from onguard24.domain.events import EventBus +from onguard24.modules.ui_support import wrap_module_html_page + +log = logging.getLogger(__name__) + +router = APIRouter(tags=["module-alerts"]) +ui_router = APIRouter(tags=["web-alerts"], include_in_schema=False) + +_VALID_STATUS = frozenset({"firing", "acknowledged", "resolved", "silenced"}) + + +def register_events(_bus: EventBus, _pool: asyncpg.Pool | None = None) -> None: + pass + + +class AckBody(BaseModel): + by_user: str | None = Field(default=None, max_length=200, description="Кто подтвердил") + + +class ResolveBody(BaseModel): + by_user: str | None = Field(default=None, max_length=200) + + +def _row_to_item(r: asyncpg.Record) -> dict: + return { + "id": str(r["id"]), + "ingress_event_id": str(r["ingress_event_id"]), + "status": r["status"], + "title": r["title"], + "severity": r["severity"], + "source": r["source"], + "grafana_org_slug": r["grafana_org_slug"], + "service_name": r["service_name"], + "labels": r["labels"] if isinstance(r["labels"], dict) else {}, + "fingerprint": r["fingerprint"], + "acknowledged_at": r["acknowledged_at"].isoformat() if r["acknowledged_at"] else None, + "acknowledged_by": r["acknowledged_by"], + "resolved_at": r["resolved_at"].isoformat() if r["resolved_at"] else None, + "resolved_by": r["resolved_by"], + "created_at": r["created_at"].isoformat() if r["created_at"] else None, + "updated_at": r["updated_at"].isoformat() if r["updated_at"] else None, + } + + +@router.get("/") +async def list_alerts_api( + pool: asyncpg.Pool | None = Depends(get_pool), + status: str | None = None, + limit: int = 100, +): + if pool is None: + return {"items": [], "database": "disabled"} + limit = min(max(limit, 1), 200) + st = (status or "").strip().lower() + if st and st not in _VALID_STATUS: + raise HTTPException(status_code=400, detail="invalid status filter") + async with pool.acquire() as conn: + if st: + rows = await conn.fetch( + """ + SELECT * FROM irm_alerts WHERE status = $1 + ORDER BY created_at DESC LIMIT $2 + """, + st, + limit, + ) + else: + rows = await conn.fetch( + """ + SELECT * FROM irm_alerts + ORDER BY created_at DESC LIMIT $1 + """, + limit, + ) + return {"items": [_row_to_item(r) for r in rows]} + + +@router.get("/{alert_id}") +async def get_alert_api(alert_id: UUID, pool: asyncpg.Pool | None = Depends(get_pool)): + if pool is None: + raise HTTPException(status_code=503, detail="database disabled") + async with pool.acquire() as conn: + row = await conn.fetchrow("SELECT * FROM irm_alerts WHERE id = $1::uuid", alert_id) + raw = None + if row and row.get("ingress_event_id"): + raw = await conn.fetchrow( + "SELECT id, body, received_at FROM ingress_events WHERE id = $1::uuid", + row["ingress_event_id"], + ) + if not row: + raise HTTPException(status_code=404, detail="not found") + out = _row_to_item(row) + if raw: + out["raw_received_at"] = raw["received_at"].isoformat() if raw["received_at"] else None + body = raw["body"] + out["raw_body"] = dict(body) if hasattr(body, "keys") else body + else: + out["raw_received_at"] = None + out["raw_body"] = None + return out + + +@router.patch("/{alert_id}/acknowledge", status_code=200) +async def acknowledge_alert_api( + alert_id: UUID, + body: AckBody, + pool: asyncpg.Pool | None = Depends(get_pool), +): + if pool is None: + raise HTTPException(status_code=503, detail="database disabled") + who = (body.by_user or "").strip() or None + async with pool.acquire() as conn: + row = await conn.fetchrow( + """ + UPDATE irm_alerts SET + status = 'acknowledged', + acknowledged_at = now(), + acknowledged_by = COALESCE($2, acknowledged_by), + updated_at = now() + WHERE id = $1::uuid AND status = 'firing' + RETURNING * + """, + alert_id, + who, + ) + if not row: + raise HTTPException(status_code=409, detail="alert not in firing state or not found") + return _row_to_item(row) + + +@router.patch("/{alert_id}/resolve", status_code=200) +async def resolve_alert_api( + alert_id: UUID, + body: ResolveBody, + pool: asyncpg.Pool | None = Depends(get_pool), +): + if pool is None: + raise HTTPException(status_code=503, detail="database disabled") + who = (body.by_user or "").strip() or None + async with pool.acquire() as conn: + row = await conn.fetchrow( + """ + UPDATE irm_alerts SET + status = 'resolved', + resolved_at = now(), + resolved_by = COALESCE($2, resolved_by), + updated_at = now() + WHERE id = $1::uuid AND status IN ('firing', 'acknowledged') + RETURNING * + """, + alert_id, + who, + ) + if not row: + raise HTTPException( + status_code=409, + detail="alert cannot be resolved from current state or not found", + ) + return _row_to_item(row) + + +_SYNC_BTN_STYLE = """ + +""" + + +@ui_router.get("/", response_class=HTMLResponse) +async def alerts_ui_list(request: Request): + pool = get_pool(request) + body = "" + if pool is None: + body = "

База не настроена.

" + else: + try: + async with pool.acquire() as conn: + rows = await conn.fetch( + """ + SELECT id, status, title, severity, grafana_org_slug, service_name, created_at, fingerprint + FROM irm_alerts + ORDER BY created_at DESC + LIMIT 150 + """ + ) + if not rows: + body = "

Пока нет алертов. События появляются после вебхука Grafana.

" + else: + trs = [] + for r in rows: + aid = str(r["id"]) + trs.append( + "" + f"{html.escape(r['status'])}" + f"" + f"{html.escape(aid[:8])}…" + f"{html.escape((r['title'] or '—')[:200])}" + f"{html.escape(r['severity'])}" + f"{html.escape(str(r['grafana_org_slug'] or '—'))}" + f"{html.escape(str(r['service_name'] or '—'))}" + f"{html.escape(r['created_at'].isoformat() if r['created_at'] else '—')}" + "" + ) + body = ( + "

Алерт — запись о входящем уведомлении. " + "Инцидент создаётся вручную (из карточки алерта или раздела «Инциденты») " + "и может ссылаться на один или несколько алертов.

" + "" + "" + + "".join(trs) + + "
СтатусIDЗаголовокВажностьGrafana slugСервисСоздан
" + ) + except Exception as e: + body = f"

{html.escape(str(e))}

" + page = f"

Алерты

{body}{_SYNC_BTN_STYLE}" + return HTMLResponse( + wrap_module_html_page( + document_title="Алерты — onGuard24", + current_slug="alerts", + main_inner_html=page, + ) + ) + + +@ui_router.get("/{alert_id:uuid}", response_class=HTMLResponse) +async def alerts_ui_detail(request: Request, alert_id: UUID): + pool = get_pool(request) + if pool is None: + return HTMLResponse( + wrap_module_html_page( + document_title="Алерт — onGuard24", + current_slug="alerts", + main_inner_html="

Алерт

База не настроена.

", + ) + ) + try: + async with pool.acquire() as conn: + row = await conn.fetchrow("SELECT * FROM irm_alerts WHERE id = $1::uuid", alert_id) + raw = None + if row and row.get("ingress_event_id"): + raw = await conn.fetchrow( + "SELECT body, received_at FROM ingress_events WHERE id = $1::uuid", + row["ingress_event_id"], + ) + except Exception as e: + return HTMLResponse( + wrap_module_html_page( + document_title="Алерт — onGuard24", + current_slug="alerts", + main_inner_html=f"

Алерт

{html.escape(str(e))}

", + ) + ) + if not row: + inner = "

Не найдено.

" + else: + aid = str(row["id"]) + st = row["status"] + title_js = json.dumps(row["title"] or "") + btns = [] + if st == "firing": + btns.append( + f"" + ) + if st in ("firing", "acknowledged"): + btns.append( + f"" + ) + btns.append( + f"" + ) + lab = row["labels"] + lab_s = json.dumps(dict(lab), ensure_ascii=False, indent=2) if isinstance(lab, dict) else "{}" + raw_pre = "" + if raw: + b = raw["body"] + pretty = json.dumps(dict(b), ensure_ascii=False, indent=2) if hasattr(b, "keys") else str(b) + if len(pretty) > 14000: + pretty = pretty[:14000] + "\n…" + raw_pre = ( + "

Полное тело вебхука

" + f"
"
+                f"{html.escape(pretty)}
" + ) + inner = ( + f"

← К списку алертов

" + f"

Алерт

{''.join(btns)}
" + f"
" + f"
ID
{html.escape(aid)}
" + f"
Статус
{html.escape(st)}
" + f"
Заголовок
{html.escape(row['title'] or '—')}
" + f"
Важность
{html.escape(row['severity'])}
" + f"
Grafana slug
{html.escape(str(row['grafana_org_slug'] or '—'))}
" + f"
Сервис
{html.escape(str(row['service_name'] or '—'))}
" + f"
Fingerprint
{html.escape(str(row['fingerprint'] or '—'))}
" + f"
Labels
{html.escape(lab_s)}
" + f"
{raw_pre}" + ) + page = f"{inner}{_SYNC_BTN_STYLE}" + return HTMLResponse( + wrap_module_html_page( + document_title="Алерт — onGuard24", + current_slug="alerts", + main_inner_html=page, + ) + ) + + +async def render_home_fragment(request: Request) -> str: + pool = get_pool(request) + if pool is None: + return '

Нужна БД для учёта алертов.

' + try: + async with pool.acquire() as conn: + n = await conn.fetchval("SELECT count(*)::int FROM irm_alerts") + nf = await conn.fetchval( + "SELECT count(*)::int FROM irm_alerts WHERE status = 'firing'" + ) + except Exception: + return '

Таблица алертов недоступна (миграция 005?).

' + return ( + f'

Алертов в учёте: {int(n)} ' + f'({int(nf)} firing). ' + f'Открыть

' + ) diff --git a/onguard24/modules/incidents.py b/onguard24/modules/incidents.py index cd65b25..38bc217 100644 --- a/onguard24/modules/incidents.py +++ b/onguard24/modules/incidents.py @@ -12,6 +12,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request from fastapi.responses import HTMLResponse from pydantic import BaseModel, Field +from onguard24.config import get_settings from onguard24.deps import get_pool from onguard24.domain.events import AlertReceived, DomainEvent, EventBus from onguard24.modules.ui_support import wrap_module_html_page @@ -26,6 +27,7 @@ class IncidentCreate(BaseModel): title: str = Field(..., min_length=1, max_length=500) status: str = Field(default="open", max_length=64) severity: str = Field(default="warning", max_length=32) + alert_ids: list[UUID] = Field(default_factory=list, description="Привязка к irm_alerts") class IncidentPatch(BaseModel): @@ -39,6 +41,8 @@ def register_events(bus: EventBus, pool: asyncpg.Pool | None = None) -> None: return async def on_alert(ev: DomainEvent) -> None: + if not get_settings().auto_incident_from_alert: + return if not isinstance(ev, AlertReceived) or ev.raw_payload_ref is None: return a = ev.alert @@ -136,17 +140,29 @@ async def create_incident_api( if pool is None: raise HTTPException(status_code=503, detail="database disabled") async with pool.acquire() as conn: - row = await conn.fetchrow( - """ - INSERT INTO incidents (title, status, severity, source, grafana_org_slug, service_name) - VALUES ($1, $2, $3, 'manual', NULL, NULL) - RETURNING id, title, status, severity, source, ingress_event_id, created_at, updated_at, - grafana_org_slug, service_name - """, - body.title.strip(), - body.status, - body.severity, - ) + async with conn.transaction(): + row = await conn.fetchrow( + """ + INSERT INTO incidents (title, status, severity, source, grafana_org_slug, service_name) + VALUES ($1, $2, $3, 'manual', NULL, NULL) + RETURNING id, title, status, severity, source, ingress_event_id, created_at, updated_at, + grafana_org_slug, service_name + """, + body.title.strip(), + body.status, + body.severity, + ) + iid = row["id"] + for aid in body.alert_ids[:50]: + await conn.execute( + """ + INSERT INTO incident_alert_links (incident_id, alert_id) + VALUES ($1::uuid, $2::uuid) + ON CONFLICT DO NOTHING + """, + iid, + aid, + ) return { "id": str(row["id"]), "title": row["title"], @@ -312,7 +328,7 @@ async def incidents_ui_home(request: Request): IDЗаголовокСтатусВажностьИсточникGrafana slugСервисСоздан {rows_html or 'Пока нет записей'} -

Создание из Grafana: webhook → ingress_events → событие → строка здесь. Пустой заголовок бывает при тестовом JSON без полей алерта.

""" +

Сначала вебхук создаёт алерт (учёт, Ack/Resolve). Инцидент — отдельная сущность: создаётся вручную или из карточки алерта, к нему можно привязать один или несколько алертов. Пустой заголовок в списке — часто тестовый JSON без полей правила.

""" return HTMLResponse( wrap_module_html_page( document_title="Инциденты — onGuard24", diff --git a/onguard24/modules/registry.py b/onguard24/modules/registry.py index 16827d2..d564965 100644 --- a/onguard24/modules/registry.py +++ b/onguard24/modules/registry.py @@ -14,6 +14,7 @@ from starlette.requests import Request from onguard24.domain.events import EventBus from onguard24.modules import ( + alerts, contacts, escalations, grafana_catalog, @@ -52,6 +53,15 @@ def _mounts() -> list[ModuleMount]: ui_router=grafana_catalog.ui_router, render_home_fragment=grafana_catalog.render_home_fragment, ), + ModuleMount( + router=alerts.router, + url_prefix="/api/v1/modules/alerts", + register_events=alerts.register_events, + slug="alerts", + title="Алерты", + ui_router=alerts.ui_router, + render_home_fragment=alerts.render_home_fragment, + ), ModuleMount( router=incidents.router, url_prefix="/api/v1/modules/incidents", diff --git a/pyproject.toml b/pyproject.toml index 7ff850c..865598f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "onguard24" -version = "1.8.0" +version = "1.9.0" description = "onGuard24 — модульный сервис (аналог IRM)" readme = "README.md" requires-python = ">=3.11" diff --git a/tests/irm_db_fake.py b/tests/irm_db_fake.py index 5d22c1a..a6ba03e 100644 --- a/tests/irm_db_fake.py +++ b/tests/irm_db_fake.py @@ -25,15 +25,28 @@ class Row: return self._data.get(key, default) +class _FakeTxn: + async def __aenter__(self) -> None: + return None + + async def __aexit__(self, *args: Any) -> None: + return None + + class IrmFakeConn: def __init__(self, store: IrmFakeStore) -> None: self.store = store + def transaction(self) -> _FakeTxn: + return _FakeTxn() + def _q(self, query: str) -> str: return " ".join(query.split()) async def execute(self, query: str, *args: Any) -> str: q = self._q(query) + if "INSERT INTO incident_alert_links" in q: + return "INSERT 0 1" if "INSERT INTO incidents" in q and "ingress_event_id" in q: self.store.insert_incident_alert( args[0], args[1], args[2], args[3], args[4] diff --git a/tests/test_alerts_api.py b/tests/test_alerts_api.py new file mode 100644 index 0000000..f053ac2 --- /dev/null +++ b/tests/test_alerts_api.py @@ -0,0 +1,9 @@ +"""API модуля алертов без БД.""" + +from fastapi.testclient import TestClient + + +def test_alerts_list_no_db(client: TestClient) -> None: + r = client.get("/api/v1/modules/alerts/") + assert r.status_code == 200 + assert r.json() == {"items": [], "database": "disabled"} diff --git a/tests/test_grafana_payload.py b/tests/test_grafana_payload.py new file mode 100644 index 0000000..58faba4 --- /dev/null +++ b/tests/test_grafana_payload.py @@ -0,0 +1,26 @@ +"""Парсинг полей из тела вебхука Grafana.""" + +from onguard24.ingress.grafana_payload import extract_alert_row_from_grafana_body + + +def test_extract_title_and_severity_from_unified() -> None: + body = { + "title": "RuleName", + "alerts": [ + { + "labels": {"severity": "critical", "alertname": "X"}, + "fingerprint": "abc", + } + ], + } + title, sev, labels, fp = extract_alert_row_from_grafana_body(body) + assert title == "RuleName" + assert sev == "critical" + assert labels.get("alertname") == "X" + assert fp == "abc" + + +def test_extract_empty_title_uses_alertname() -> None: + body = {"alerts": [{"labels": {"alertname": "HostDown"}}]} + title, _, _, _ = extract_alert_row_from_grafana_body(body) + assert title == "HostDown" diff --git a/tests/test_ingress.py b/tests/test_ingress.py index 2e402f2..c5b9cba 100644 --- a/tests/test_ingress.py +++ b/tests/test_ingress.py @@ -1,9 +1,23 @@ import json from unittest.mock import AsyncMock, MagicMock, patch - from fastapi.testclient import TestClient +def _webhook_mock_pool(mock_conn: AsyncMock) -> MagicMock: + """Пул с транзакцией и execute — как после вставки ingress + irm_alerts.""" + tx = AsyncMock() + tx.__aenter__ = AsyncMock(return_value=None) + tx.__aexit__ = AsyncMock(return_value=None) + mock_conn.transaction = MagicMock(return_value=tx) + mock_conn.execute = AsyncMock() + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_conn) + mock_cm.__aexit__ = AsyncMock(return_value=None) + mock_pool = MagicMock() + mock_pool.acquire = MagicMock(return_value=mock_cm) + return mock_pool + + def test_grafana_webhook_no_db(client: TestClient) -> None: """Без пула БД — 202, запись не падает.""" r = client.post( @@ -41,12 +55,7 @@ def test_grafana_webhook_inserts_with_mock_pool(client: TestClient) -> None: mock_conn = AsyncMock() uid = uuid4() mock_conn.fetchrow = AsyncMock(return_value={"id": uid}) - mock_cm = AsyncMock() - mock_cm.__aenter__ = AsyncMock(return_value=mock_conn) - mock_cm.__aexit__ = AsyncMock(return_value=None) - - mock_pool = MagicMock() - mock_pool.acquire = MagicMock(return_value=mock_cm) + mock_pool = _webhook_mock_pool(mock_conn) app = client.app real_pool = app.state.pool @@ -59,6 +68,7 @@ def test_grafana_webhook_inserts_with_mock_pool(client: TestClient) -> None: ) assert r.status_code == 202 mock_conn.fetchrow.assert_called_once() + mock_conn.execute.assert_called_once() finally: app.state.pool = real_pool @@ -69,11 +79,7 @@ def test_grafana_webhook_auto_org_from_external_url(client: TestClient) -> None: mock_conn = AsyncMock() uid = uuid4() mock_conn.fetchrow = AsyncMock(return_value={"id": uid}) - mock_cm = AsyncMock() - mock_cm.__aenter__ = AsyncMock(return_value=mock_conn) - mock_cm.__aexit__ = AsyncMock(return_value=None) - mock_pool = MagicMock() - mock_pool.acquire = MagicMock(return_value=mock_cm) + mock_pool = _webhook_mock_pool(mock_conn) app = client.app real_pool = app.state.pool @@ -99,11 +105,7 @@ def test_grafana_webhook_publishes_alert_received(client: TestClient) -> None: mock_conn = AsyncMock() uid = uuid4() mock_conn.fetchrow = AsyncMock(return_value={"id": uid}) - mock_cm = AsyncMock() - mock_cm.__aenter__ = AsyncMock(return_value=mock_conn) - mock_cm.__aexit__ = AsyncMock(return_value=None) - mock_pool = MagicMock() - mock_pool.acquire = MagicMock(return_value=mock_cm) + mock_pool = _webhook_mock_pool(mock_conn) app = client.app bus = app.state.event_bus @@ -130,11 +132,7 @@ def test_grafana_webhook_org_any_slug_without_json_config(client: TestClient) -> mock_conn = AsyncMock() uid = uuid4() mock_conn.fetchrow = AsyncMock(return_value={"id": uid}) - mock_cm = AsyncMock() - mock_cm.__aenter__ = AsyncMock(return_value=mock_conn) - mock_cm.__aexit__ = AsyncMock(return_value=None) - mock_pool = MagicMock() - mock_pool.acquire = MagicMock(return_value=mock_cm) + mock_pool = _webhook_mock_pool(mock_conn) app = client.app real_pool = app.state.pool @@ -157,11 +155,7 @@ def test_grafana_webhook_org_ok(client: TestClient) -> None: mock_conn = AsyncMock() uid = uuid4() mock_conn.fetchrow = AsyncMock(return_value={"id": uid}) - mock_cm = AsyncMock() - mock_cm.__aenter__ = AsyncMock(return_value=mock_conn) - mock_cm.__aexit__ = AsyncMock(return_value=None) - mock_pool = MagicMock() - mock_pool.acquire = MagicMock(return_value=mock_cm) + mock_pool = _webhook_mock_pool(mock_conn) app = client.app real_json = app.state.settings.grafana_sources_json diff --git a/tests/test_irm_modules.py b/tests/test_irm_modules.py index 5dadf78..554105c 100644 --- a/tests/test_irm_modules.py +++ b/tests/test_irm_modules.py @@ -29,8 +29,41 @@ def test_escalations_api_list_no_db(client: TestClient) -> None: @pytest.mark.asyncio -async def test_incident_inserted_on_alert_received() -> None: - """При пуле БД подписка создаёт инцидент (INSERT).""" +async def test_incident_not_created_from_alert_by_default() -> None: + """По умолчанию AUTO_INCIDENT_FROM_ALERT выкл — инцидент из вебхука не создаётся.""" + calls: list = [] + + async def fake_execute(_query, *args): + calls.append(args) + return "INSERT 0 1" + + mock_conn = AsyncMock() + mock_conn.execute = fake_execute + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_conn) + mock_cm.__aexit__ = AsyncMock(return_value=None) + mock_pool = MagicMock() + mock_pool.acquire = MagicMock(return_value=mock_cm) + + from onguard24.domain.events import InMemoryEventBus + from onguard24.modules import incidents as inc_mod + + bus = InMemoryEventBus() + inc_mod.register_events(bus, mock_pool) + + uid = uuid4() + ev = AlertReceived( + alert=Alert(source="grafana", title="CPU high", severity=Severity.WARNING), + raw_payload_ref=uid, + ) + await bus.publish(ev) + assert calls == [] + + +@pytest.mark.asyncio +async def test_incident_inserted_on_alert_when_auto_enabled(monkeypatch: pytest.MonkeyPatch) -> None: + """При AUTO_INCIDENT_FROM_ALERT=1 подписка снова создаёт инцидент (legacy).""" + monkeypatch.setenv("AUTO_INCIDENT_FROM_ALERT", "1") inserted: dict = {} async def fake_execute(_query, *args): diff --git a/tests/test_root_ui.py b/tests/test_root_ui.py index 957e347..ed2887c 100644 --- a/tests/test_root_ui.py +++ b/tests/test_root_ui.py @@ -33,6 +33,7 @@ def test_rail_lists_all_registered_ui_modules(client: TestClient) -> None: t = r.text expected = ( ("grafana-catalog", "Каталог Grafana"), + ("alerts", "Алерты"), ("incidents", "Инциденты"), ("tasks", "Задачи"), ("escalations", "Эскалации"), @@ -49,6 +50,7 @@ def test_each_module_page_single_active_nav_item(client: TestClient) -> None: """На странице модуля ровно один пункт с aria-current (текущий раздел).""" for slug in ( "grafana-catalog", + "alerts", "incidents", "tasks", "escalations",