Files
vf-cuda-grid/controller/cuda_grid_controller/__main__.py
T
gx d29f3f96e5 pipeline_monitor: + stall watchdog (mediamtx bytes-based detect)
Resilience improvement — раньше pipeline мог hang без exit (NVENC stuck,
output broken pipe), Docker restart policy не triggered. Никакого alert.

Now: poll mediamtx /v3/rtspsessions/list каждые N sec, track publish session
inboundBytes. Не растёт 3 polls (~9 sec) → emit MQTT 'pipeline_stalled' event
(через dispatcher.on_event = mqtt.publish_event). User / Home Assistant
automation решает что делать (restart container, notify).

Wired:
  pipeline_monitor.on_event = mqtt.publish_event  # __main__.py

Bytes started growing again → emit 'pipeline_unstalled'.

Alert single-shot: пока stalled flag set, no dup alerts. Reset когда
bytes counter растёт.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 10:03:41 +01:00

197 lines
6.2 KiB
Python

"""Entry point: `cuda-grid-controller --config controller.yaml`."""
from __future__ import annotations
import asyncio
import logging
import sys
from pathlib import Path
import structlog
import typer
import uvicorn
from .config import Config
from .dispatch import CommandDispatcher
from .browser_overlays import BrowserRenderer, DashboardCfg
from .dynamic_overlays import ChartCfg, ChatCfg, DynamicRenderer
from .pipeline_monitor import PipelineMonitor
from .frigate_bridge import FrigateBridge, FrigateBridgeCfg
from .http_api import create_app
from .mqtt_loop import MqttLoop
from .snapshot_history import SnapshotHistory
from .state import ControllerState
from .watchdog import StreamWatchdog, WatchdogCfg
# Typer application object; commands are registered on it with @cli.command().
# add_completion=False turns off Typer's shell-completion install options.
cli = typer.Typer(add_completion=False)
def _configure_logging(level: str) -> None:
    """Configure stdlib ``logging`` and ``structlog`` for console output.

    Args:
        level: Log level name, matched case-insensitively against the
            attributes of the ``logging`` module (``"debug"``, ``"INFO"``, ...).
            A name that is not an attribute of ``logging`` falls back to
            ``logging.INFO``.
    """
    # Resolve the textual level first so the basicConfig call stays flat.
    resolved_level = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(format="%(message)s", level=resolved_level)

    # structlog chain: attach the level, stamp ISO time, render for a console.
    processor_chain = [
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.dev.ConsoleRenderer(),
    ]
    structlog.configure(processors=processor_chain)
async def _stop_quietly(log, step: str, stop) -> None:
    """Await one shutdown coroutine function, logging instead of raising.

    Args:
        log: structlog logger used to report a failed step.
        step: Component name included in the log event.
        stop: Zero-argument coroutine function, e.g. ``component.stop``.
    """
    try:
        await stop()
    except Exception as e:
        # Shutdown is best-effort: one broken component must not prevent the
        # remaining ones from being stopped.
        log.warning("controller.shutdown_step_failed", step=step, error=str(e))


async def _run(cfg: Config) -> None:
    """Build every controller component from *cfg* and run until shutdown.

    Construction order: state -> dispatcher -> optional Frigate bridge ->
    optional dynamic/browser overlay renderers -> MQTT loop -> snapshot
    history -> optional stream watchdog -> HTTP app/server -> pipeline
    monitor. Background components are then started, and the MQTT loop and
    the uvicorn server are awaited together.

    Optional sections (``cfg.frigate``, ``cfg.dynamic_overlays``,
    ``cfg.watchdog``) that fail validation or construction are logged as
    warnings and skipped; they never abort startup.

    On exit (normal return, cancellation, or an error during startup or while
    serving) every component that was successfully started is stopped, in the
    order: pipeline monitor, dynamic renderer, browser renderer, snapshot
    history, watchdog, dispatcher, MQTT loop. A failure in one shutdown step
    is logged and does not skip the remaining steps.

    Args:
        cfg: Parsed controller configuration.

    Raises:
        Exception: Anything raised by a component ``start()``, by
            ``mqtt.run()`` or by ``server.serve()`` propagates to the caller
            after cleanup has run. ``asyncio.CancelledError`` raised while
            serving is logged as a shutdown and swallowed.
    """
    log = structlog.get_logger()

    state = ControllerState()
    # Initialise each instance's active layout with its configured default.
    for inst in cfg.instances:
        await state.set_layout(inst.name, inst.default_layout)

    dispatcher = CommandDispatcher(cfg, state)

    # Frigate bridge (optional) — receives the dispatcher for automatic
    # overlay generation.
    frigate_bridge: FrigateBridge | None = None
    if cfg.frigate:
        try:
            fcfg = FrigateBridgeCfg.model_validate(cfg.frigate)
            if fcfg.enabled:
                frigate_bridge = FrigateBridge(fcfg, dispatcher=dispatcher)
        except Exception as e:
            log.warning("frigate_bridge.config_invalid", error=str(e))

    # Dynamic overlays (charts/chats/dashboards) — Phase 6.
    dynamic_renderer: DynamicRenderer | None = None
    browser_renderer: BrowserRenderer | None = None
    if cfg.dynamic_overlays:
        try:
            d = cfg.dynamic_overlays
            charts = [ChartCfg.model_validate(c) for c in (d.get("charts") or [])]
            chats = [ChatCfg.model_validate(c) for c in (d.get("chats") or [])]
            if charts or chats:
                dynamic_renderer = DynamicRenderer(
                    icon_dir=Path(cfg.icon_dir),
                    dispatcher=dispatcher,
                    charts=charts,
                    chats=chats,
                )
            dashboards = [
                DashboardCfg.model_validate(b)
                for b in (d.get("dashboards") or [])
            ]
            if dashboards:
                browser_renderer = BrowserRenderer(
                    icon_dir=Path(cfg.icon_dir),
                    dispatcher=dispatcher,
                    dashboards=dashboards,
                )
        except Exception as e:
            log.warning("dynamic_overlays.config_invalid", error=str(e))

    mqtt = MqttLoop(
        cfg,
        state,
        dispatcher.handle,
        frigate_bridge=frigate_bridge,
        dynamic_renderer=dynamic_renderer,
    )
    # Wire dispatcher events to MQTT publishes.
    dispatcher.on_state_change = mqtt.publish_state
    dispatcher.on_event = mqtt.publish_event

    # Snapshot history (Phase 6+) — periodic capture per instance.
    snapshot_hist = SnapshotHistory(cfg)

    # Stream watchdog (Phase 1 resilience, issue #3) — monitors mediamtx paths.
    watchdog: StreamWatchdog | None = None
    if cfg.watchdog:
        try:
            wcfg = WatchdogCfg.model_validate(cfg.watchdog)
            if wcfg.enabled:
                # NOTE(review): `mqtt` already exists at this point, so the
                # publisher could probably be passed here directly instead of
                # being patched in below — confirm how StreamWatchdog stores
                # `mqtt_publish_event` before changing it.
                watchdog = StreamWatchdog(
                    wcfg, dispatcher, mqtt_publish_event=None
                )
        except Exception as e:
            log.warning("watchdog.config_invalid", error=str(e))

    # HTTP REST API.
    app = create_app(
        cfg,
        state,
        dispatcher,
        snapshot_history=snapshot_hist,
        frigate_bridge=frigate_bridge,
    )
    server = uvicorn.Server(
        uvicorn.Config(
            app,
            host=cfg.http.host,
            port=cfg.http.port,
            log_level=cfg.log.level.lower(),
        )
    )

    log.info(
        "controller.starting",
        instances=[i.name for i in cfg.instances],
        mqtt=f"{cfg.broker.host}:{cfg.broker.port}",
        http=f"{cfg.http.host}:{cfg.http.port}",
    )

    # Names of components whose start succeeded; only those get stopped.
    started: set[str] = set()
    pipeline_monitor: PipelineMonitor | None = None
    try:
        # Start the background components (those that are configured).
        if dynamic_renderer:
            await dynamic_renderer.start()
            started.add("dynamic_renderer")
        if browser_renderer:
            await browser_renderer.start()
            started.add("browser_renderer")
        await snapshot_hist.start()
        started.add("snapshot_history")
        if watchdog:
            # Inject the MQTT event publisher before the watchdog starts.
            watchdog._publish_event = mqtt.publish_event
            await watchdog.start()
            started.add("watchdog")

        # Pipeline monitor — per the original author's note: detects ffmpeg
        # restarts, auto-restores overlay state and raises encoder stall
        # alerts, delivered as MQTT events through `on_event`.
        pipeline_monitor = PipelineMonitor(
            cfg=cfg,
            state=state,
            dispatcher=dispatcher,
            browser_renderer=browser_renderer,
            dynamic_renderer=dynamic_renderer,
            frigate_bridge=frigate_bridge,
        )
        pipeline_monitor.on_event = mqtt.publish_event
        await pipeline_monitor.start()
        started.add("pipeline_monitor")

        started.add("mqtt")
        try:
            # Returns when both coroutines finish, or raises as soon as
            # either of them raises.
            await asyncio.gather(
                mqtt.run(),
                server.serve(),
            )
        except asyncio.CancelledError:
            log.info("controller.shutdown")
    finally:
        # Same shutdown order as before; each step is isolated so that one
        # failing stop() cannot skip the ones after it.
        components = (
            ("pipeline_monitor", pipeline_monitor),
            ("dynamic_renderer", dynamic_renderer),
            ("browser_renderer", browser_renderer),
            ("snapshot_history", snapshot_hist),
            ("watchdog", watchdog),
        )
        for step, component in components:
            if step in started:
                await _stop_quietly(log, step, component.stop)
        await _stop_quietly(log, "dispatcher", dispatcher.close)
        if "mqtt" in started:
            await _stop_quietly(log, "mqtt", mqtt.stop)
@cli.command()
def run(
    config: Path = typer.Option(
        Path("controller.yaml"),
        "--config",
        "-c",
        help="YAML config path",
    ),
) -> None:
    """Run the controller."""
    # The docstring is deliberately one line: Typer uses a command's
    # docstring as its help text.
    #
    # Flow: bail out with exit code 1 when the config path does not exist,
    # otherwise load it, configure logging from it and run the async main.
    config_missing = not config.exists()
    if config_missing:
        typer.echo(f"config not found: {config}", err=True)
        raise typer.Exit(code=1)

    settings = Config.from_yaml(config)
    _configure_logging(settings.log.level)
    asyncio.run(_run(settings))
def main() -> None:
    """Invoke the Typer CLI application."""
    cli()


# Run the CLI when this module is executed as a script or via `python -m`.
if __name__ == "__main__":
    main()