v0.3.1: per-subscriber monitor thread — fix bitmap leak

Bug: handshake_subscriber assigned bit + activated slot но НЕ tracked client_fd. Когда subscriber container exited, socket closed on client side но producer не detected → bit оставался set forever → после 32 connections subscribe_create('cam-X'): too many subscribers (max 32). Симптом в production: каждый pipeline recreate accumulated 1 stale subscriber. После 4-5 recreate операций publishers перестали accept new pipeline → "too many subscribers" crash loop. Fix: после успешного handshake spawn detached pthread monitoring socket via blocking recv(). recv() returns 0 (EOF) когда other side closes — monitor clears bit (subscriber_bitmap &= ~(1<<bit)) + state[bit] = 0, closes fd, exits. Cost: 1 thread per active subscriber. Max 32 threads — небольшой overhead. Threads detached, no join needed. Stress test: 5x pipeline recreate без single "too many subscribers" error. Раньше: 2-3 recreate → bitmap overflow. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
v0.3: per-slot CUDA events — закрывает TOCTOU race без crutches
2026-05-24 08:00:41 +01:00 · 2026-05-22 09:23:53 +01:00 · 2026-05-21 22:27:39 +01:00 · 2026-05-19 20:39:47 +01:00 · 2026-05-19 19:22:53 +01:00 · 2026-05-19 19:07:16 +01:00
36 changed files with 4305 additions and 262 deletions
@@ -0,0 +1,181 @@
+name: build
+
+on:
+  push:
+    branches: [main]
+    paths-ignore:
+      - '**.md'
+      - 'docs/**'
+      - 'BENCHMARKS.md'
+      - 'ROADMAP.md'
+      - 'CHANGELOG.md'
+      - 'LICENSE'
+      - '.gitea/ISSUE_TEMPLATE/**'
+  pull_request:
+    branches: [main]
+
+jobs:
+  cmake-build:
+    name: cmake build (CUDA 12.4, Ubuntu 22.04)
+    runs-on: ubuntu-22.04
+    container:
+      image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+    steps:
+      # actions/checkout@v4 требует Node 20+. Ubuntu 22.04 apt даёт Node 12 — не подходит.
+      # Ставим Node 20 из NodeSource repo.
+      - name: Bootstrap Node 20 + git (для actions/checkout)
+        run: |
+          set -e
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get update
+          apt-get install -y --no-install-recommends curl git ca-certificates gnupg
+          # NodeSource setup может молча упасть на slow networks (особенно через VPN
+          # на u4-runner); retry + явная verification что Node >= 18 после install.
+          for i in 1 2 3; do
+            if curl -fsSL --retry 3 --retry-delay 5 --connect-timeout 30 \
+                 https://deb.nodesource.com/setup_20.x | bash -; then
+              break
+            fi
+            echo "NodeSource setup attempt $i failed, retrying..."
+            sleep 10
+          done
+          apt-get install -y --no-install-recommends nodejs
+          NODE_VER=$(node --version)
+          echo "node: $NODE_VER"
+          # actions/checkout@v4 требует Node 20+ (ES2022 static blocks).
+          # Если NodeSource setup упал и установился Ubuntu's Node 12 — фейлим явно.
+          NODE_MAJOR=$(echo "$NODE_VER" | sed -E 's/^v([0-9]+).*/\1/')
+          if [ "$NODE_MAJOR" -lt 18 ]; then
+            echo "ERROR: Node $NODE_VER too old, NodeSource setup likely failed" >&2
+            exit 1
+          fi
+
+      - name: Install build deps
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get install -y --no-install-recommends \
+              build-essential cmake ninja-build pkg-config \
+              libavformat-dev libavcodec-dev libavutil-dev libswscale-dev
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Configure (full — libcuframes + examples + tools)
+        run: |
+          cmake -B build -S . -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DBUILD_TESTING=OFF \
+              -DBUILD_EXAMPLES=ON \
+              -DBUILD_TOOLS=ON \
+              -DBUILD_FFMPEG_FILTER=OFF \
+              -DBUILD_PYTHON_BINDINGS=OFF
+
+      - name: Build
+        run: cmake --build build --parallel
+
+      - name: Verify produced binaries + library
+        run: |
+          ls -la build/libcuframes/libcuframes.so*
+          ls -la build/libcuframes/libcuframes_static.a
+          ls -la build/tools/cuframes-rtsp-source/cuframes-rtsp-source
+          ls -la build/examples/sub_count/sub_count
+          ./build/tools/cuframes-rtsp-source/cuframes-rtsp-source --help | head -5
+
+      - name: Install + verify install layout
+        run: |
+          cmake --install build --prefix /tmp/cuframes-install
+          test -f /tmp/cuframes-install/include/cuframes/cuframes.h
+          test -f /tmp/cuframes-install/include/cuframes/cuframes.hpp
+          test -f /tmp/cuframes-install/lib/libcuframes.so
+          test -f /tmp/cuframes-install/lib/libcuframes_static.a
+
+  filter-build:
+    name: ffmpeg filter patch (out-of-tree)
+    runs-on: ubuntu-22.04
+    container:
+      image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+    needs: cmake-build
+    steps:
+      - name: Bootstrap Node 20 + git (для actions/checkout)
+        run: |
+          set -e
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get update
+          apt-get install -y --no-install-recommends curl git ca-certificates gnupg
+          # NodeSource setup может молча упасть на slow networks (особенно через VPN
+          # на u4-runner); retry + явная verification что Node >= 18 после install.
+          for i in 1 2 3; do
+            if curl -fsSL --retry 3 --retry-delay 5 --connect-timeout 30 \
+                 https://deb.nodesource.com/setup_20.x | bash -; then
+              break
+            fi
+            echo "NodeSource setup attempt $i failed, retrying..."
+            sleep 10
+          done
+          apt-get install -y --no-install-recommends nodejs
+          NODE_VER=$(node --version)
+          echo "node: $NODE_VER"
+          # actions/checkout@v4 требует Node 20+ (ES2022 static blocks).
+          # Если NodeSource setup упал и установился Ubuntu's Node 12 — фейлим явно.
+          NODE_MAJOR=$(echo "$NODE_VER" | sed -E 's/^v([0-9]+).*/\1/')
+          if [ "$NODE_MAJOR" -lt 18 ]; then
+            echo "ERROR: Node $NODE_VER too old, NodeSource setup likely failed" >&2
+            exit 1
+          fi
+
+      - name: Install build deps
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get install -y --no-install-recommends \
+              build-essential cmake ninja-build pkg-config nasm \
+              libssl-dev libx264-dev libx265-dev libnuma-dev zlib1g-dev \
+              wget patch
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build libcuframes (для linking в patched ffmpeg)
+        run: |
+          cmake -B build -S . -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DBUILD_TESTING=OFF -DBUILD_EXAMPLES=OFF -DBUILD_TOOLS=OFF
+          cmake --build build --parallel
+          cmake --install build --prefix /opt/cuframes
+
+      # Clone уже-patched FFmpeg fork с локального gitea (быстро + offline).
+      # Используем ${GITHUB_SERVER_URL} — runner подставит свой view на gitea:
+      # на R9-runner = http://192.168.88.23:3222, на u4-runner = http://10.8.0.6:3222 (VPN).
+      # Hardcoded https://git.goldix.org/... не работает на u4 — нет route к public IP.
+      - name: Clone patched FFmpeg fork (local gitea mirror)
+        run: |
+          git clone --depth 1 --branch n7.1-cuframes \
+              "${GITHUB_SERVER_URL}/gx/ffmpeg-patched.git" /src/ffmpeg
+          ls /src/ffmpeg/libavformat/cuframesdec.c
+
+      - name: Configure FFmpeg (minimal + libcuframes)
+        run: |
+          cd /src/ffmpeg
+          ./configure \
+              --prefix=/opt/ffmpeg \
+              --enable-libcuframes \
+              --extra-cflags="-I/opt/cuframes/include -I/usr/local/cuda/include" \
+              --extra-ldflags="-L/opt/cuframes/lib -L/usr/local/cuda/lib64" \
+              --extra-libs="-lcudart -lpthread -lrt -lm" \
+              --disable-x86asm --disable-everything \
+              --enable-demuxer=cuframes,rawvideo \
+              --enable-decoder=rawvideo \
+              --enable-muxer=null,rawvideo \
+              --enable-protocol=file --enable-ffmpeg \
+              --disable-doc --disable-htmlpages --disable-manpages \
+              --disable-podpages --disable-txtpages
+
+      - name: Build FFmpeg
+        run: |
+          cd /src/ffmpeg
+          make -j$(nproc) ffmpeg
+
+      - name: Verify cuframes demuxer registered
+        run: |
+          export LD_LIBRARY_PATH=/opt/cuframes/lib
+          /src/ffmpeg/ffmpeg -hide_banner -formats | grep cuframes
+          /src/ffmpeg/ffmpeg -hide_banner -h demuxer=cuframes | head -10
@@ -0,0 +1,78 @@
+name: release
+
+# Триггер: push tag v* (e.g. v0.1.0, v0.2.0).
+# Сборка: runtime Docker image + source tarball, прикладываем к gitea release.
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+jobs:
+  docker-runtime:
+    name: build runtime Docker image
+    runs-on: ubuntu-22.04
+    container:
+      image: docker.gitea.com/runner-images:ubuntu-22.04
+      # docker socket нужен — gitea runner монтирует /var/run/docker.sock
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Tag from ref
+        id: tag
+        run: |
+          TAG="${GITHUB_REF#refs/tags/v}"
+          echo "version=$TAG" >> $GITHUB_OUTPUT
+
+      - name: Login to gitea registry
+        run: |
+          echo "${{ secrets.GITEA_TOKEN }}" | docker login git.goldix.org \
+              -u "${{ github.actor }}" --password-stdin
+
+      - name: Build runtime image
+        run: |
+          docker build -f docker/Dockerfile.runtime \
+              -t git.goldix.org/gx/cuframes:${{ steps.tag.outputs.version }} \
+              -t git.goldix.org/gx/cuframes:latest \
+              .
+
+      - name: Push
+        run: |
+          docker push git.goldix.org/gx/cuframes:${{ steps.tag.outputs.version }}
+          docker push git.goldix.org/gx/cuframes:latest
+
+  source-tarball:
+    name: build source tarball
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Tag from ref
+        id: tag
+        run: |
+          TAG="${GITHUB_REF#refs/tags/v}"
+          echo "version=$TAG" >> $GITHUB_OUTPUT
+
+      - name: Create tarball
+        run: |
+          VERSION="${{ steps.tag.outputs.version }}"
+          mkdir -p /tmp/release
+          git archive --format=tar.gz --prefix="cuframes-$VERSION/" \
+              -o "/tmp/release/cuframes-$VERSION.tar.gz" HEAD
+          ls -la /tmp/release/
+
+      # Готовый artifact — пользователь скачает с release page либо attached к release.
+      # Gitea release upload через API делается отдельным шагом (см. gitea/release-action
+      # либо curl); тут оставляем артефакт как build output для последующего ручного
+      # attach. Для полной автоматизации — добавить шаг upload через curl + GITEA_TOKEN.
+      - name: Upload tarball as artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: cuframes-${{ steps.tag.outputs.version }}-source
+          path: /tmp/release/cuframes-*.tar.gz
@@ -0,0 +1,21 @@
+name: test-u4-runner
+
+on:
+  workflow_dispatch:
+  push:
+    paths:
+      - '.gitea/workflows/test-u4-runner.yml'
+
+jobs:
+  hello:
+    name: u4 runner smoke test
+    runs-on: u4
+    container:
+      image: ubuntu:24.04
+    steps:
+      - name: hostname + uname
+        run: |
+          echo "hostname: $(hostname)"
+          echo "uname: $(uname -a)"
+          echo "ip route: $(ip route | head -3)"
+          echo "test OK"
@@ -117,3 +117,95 @@ cd build && cmake -DBUILD_TESTING=ON ..  && cmake --build . && ctest -R stress -
 Production деplo замеры — см. интеграционные guides:
 - [docs/integration.md](docs/integration.md) — cctv-processor C++ pipeline
 - [filter/README.md](filter/README.md) — FFmpeg demuxer (Frigate setup)
+
+---
+
+## Real-world production deployment (2026-05-19, v0.2.0)
+
+**Setup**: 4 Dahua IP-камеры (HEVC main 1920×1080 / 2688×1520, 25 fps) → 3
+одновременных consumer'а на одном RTX 5090 хосте:
+- **Frigate** detect (ONNX D-FINE-S, 640×480) + record (full-res H.265 mp4)
+- **cctv-backend** custom C++ mosaic processor (composes 4×grid → RTSP output для TV)
+
+### Before → after (measured production, идентичный workload)
+
+| Метрика | Без cuframes | С cuframes v0.2 dual-input | Reduction |
+|---|---:|---:|---:|
+| **RTSP connections к камерам** | 12 (4 cam × 3 consumer) | **4** (publishers only) | **−67%** |
+| **NVDEC sessions** | ~8 (decode на каждый consumer) | **4** (publishers only) | **−50%** |
+| **Camera-side bandwidth** | ~34 Mbps (main+main+sub per cam) | **~16 Mbps** (main per cam) | **−54%** |
+| **PCIe D2H copies (consumer side)** | ~346 MB/s (decoded frames → host) | **~0** (zero-copy CUDA IPC) | **−100%** |
+| **Frigate ffmpeg с прямым RTSP** | 8 (detect+record × 4) | **0** (all через cuframes) | **−100%** |
+
+### Live nvidia-smi metrics в running system
+
+```
+GPU SM:     4-5%   (compute: detector + cuframes consumers)
+GPU NVDEC:  2-4%   (без cuframes ожидаемо было 15-25%)
+GPU NVENC:  0-1%
+```
+
+### VRAM breakdown (measured)
+
+| Component | VRAM |
+|---|---:|
+| 4× cuframes publishers (3× FHD ring + 1× 2688×1520 для LPR) | **4.4 GB** |
+| cctv-backend (composer + grid output) | 1.0 GB |
+| frigate.embeddings_manager (face + LPR ONNX models) | 1.6 GB |
+| frigate.detector:onnx (D-FINE-S COCO) | 0.6 GB |
+| **Total cuframes-stack VRAM** | **~7.7 GB** |
+
+Из них на сам cuframes accounting — только **4.4 GB** в publishers (ring buffers +
+NVDEC decode buffers). Consumers (Frigate, cctv-backend) держат свои CUDA
+contexts независимо.
+
+### Network bandwidth (real tcpdump, 10-sec sample)
+
+**31.5 Mbps** от camera subnet (4 cameras → R9), измерено через
+`tcpdump -w cam-traffic.pcap` за 10 секунд.
+
+Breakdown approximate:
+- 4 publishers × main HEVC RTP/UDP: **~16 Mbps** (cuframes core)
+- go2rtc on-demand streams (Frigate UI live preview, если открыт): **0-10 Mbps**
+- ONVIF discovery, RTSP keepalives, NTP-from-cameras: **~1-2 Mbps**
+
+Без cuframes тот же setup (cctv-backend + Frigate detect + Frigate record × 4
+camera) дал бы **~45-50 Mbps** (главное: record path забирал отдельный
+main stream от каждой camera).
+
+### Camera-side benefits
+
+Dahua/Hikvision камеры обычно cap'нуты на 4-5 одновременных RTSP streams.
+До cuframes setup (4 cam × 3 RTSP) делал каждую camera на **60-75% capacity**
+её RTSP server'а. После — **20-25%**, headroom на 2-3 дополнительных
+consumer'а без замены оборудования.
+
+### Что **сохранено** (важно)
+
+- **Качество записи**: record path через `cuframes_packets://` это **passthrough**
+  (`-c:v copy`), bit-exact original encoded stream от камеры. Frigate пишет mp4
+  с full-resolution оригинала, без re-encode.
+- **Latency**: <2 ms publisher → consumer (cuframes IPC) vs ~50-80 ms RTSP setup
+  latency для каждого нового consumer.
+- **Backward compatibility**: v0.2 publishers принимают v1 subscribers
+  (frames-only), rolling upgrade.
+
+### Hardware-agnostic projection (для другого setup)
+
+| If you have | Expected reduction |
+|---|---|
+| 16 cameras × 2 consumers | 32 → 16 NVDEC (−50%), 32 → 16 RTSP (−50%) |
+| 8 cameras × 3 consumers | 24 → 8 NVDEC (−67%), 24 → 8 RTSP (−67%) |
+| 4 cameras × 4 consumers (multi-AI pipeline) | 16 → 4 NVDEC (−75%), 16 → 4 RTSP (−75%) |
+
+Reduction масштабируется **линейно** с N (consumers per camera). v0.1 (frames
+only) сэкономит NVDEC; v0.2 (frames + packets) **дополнительно** сэкономит
+RTSP connections для record/mux consumers.
+
+### Что **НЕ** сэкономлено (честно)
+
+- **Disk space**: запись остаётся full-resolution H.265 mp4. Cuframes не сжимает.
+- **Detector inference latency**: ONNX/TensorRT detector работает на decoded
+  frames независимо от source. Cuframes только меняет где decode произошёл.
+- **Camera RTSP server CPU**: сама камера всё равно encode'ит видео. Cuframes
+  reduces **consumer-side** load, не producer-side.
@@ -5,6 +5,51 @@
 Формат основан на [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 проект следует [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.2.0] — 2026-05-19
+
+Encoded packet ring — параллельный канал для record/mux consumer'ов
+без второго RTSP-подключения к камере.
+
+См. issue [#2](https://git.goldix.org/gx/cuframes/issues/2),
+PRs [#4](https://git.goldix.org/gx/cuframes/pulls/4) (cuframes) +
+[gx/ffmpeg-patched#1](https://git.goldix.org/gx/ffmpeg-patched/pulls/1)
+(FFmpeg demuxer).
+
+### Added
+
+- **Encoded packet ring** — параллельный ring для H.264/H.265 NAL units
+  (отдельный SHM `/dev/shm/cuframes-<key>-packets`, variable-length byte
+  buffer + slot index, seqlock-style read для защиты от overrun).
+- **Wire protocol v2** (`proto_version = 2` в SHM header). Backward-compat:
+  v2 publishers принимают v1 subscribers (frames-only).
+- **Public C API** (`include/cuframes/cuframes.h`):
+  - `cuframes_publisher_enable_packets(opts)` — активирует ring
+  - `cuframes_publisher_set_codec_extradata(data, size)` — SPS/PPS
+  - `cuframes_publisher_publish_packet(data, size, pts, dts, flags)`
+  - `cuframes_subscriber_enable_packets()` + `_next_packet()` + accessors
+  - `cuframes_subscriber_get_codec_params(codec_id, extradata, size)`
+- **`cuframes::Publisher`** (C++ RAII): `enable_packets`, `set_codec_extradata`,
+  `publish_packet` методы.
+- **`cuframes-rtsp-source`**: новый CLI flag `--enable-packet-ring`.
+  Дублирует `AVPacket` в encoded ring до передачи декодеру.
+- **FFmpeg demuxer `cuframes_packets://<key>`** (отдельная ветка
+  [gx/ffmpeg-patched PR #1](https://git.goldix.org/gx/ffmpeg-patched/pulls/1)).
+  Companion к `cuframes://`. Use case: Frigate `record` role без
+  второго RTSP к камере.
+- **4 новых error codes**: `PACKET_OVERSIZED`, `NO_PACKET_RING`,
+  `NO_CODEC_PARAMS`, `PACKET_OVERRUN`.
+- **Stress test** `libcuframes/tests/test_packet_ring.c`: 2 scenarios —
+  normal flow (1 pub × 1 sub × 2000 packets, integrity check) +
+  slow consumer (must hit OVERRUN + library auto-resync на keyframe).
+- **Protocol spec §10** в `docs/protocol.md` (397 строк): byte-exact
+  layout, seqlock semantics, late-subscriber GOP-aligned start.
+
+### Limitations (документировано)
+
+- Sub-stream selection отложено в v0.3 (`<key>-substream-<N>` naming).
+- Audio packets — v0.3 (тот же ring layout, codec_id = audio).
+- Codec change mid-stream — требует publisher destroy+recreate.
+
 ## [0.1.0] — 2026-05-17

 Первый функциональный release с production deployment.
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.20)
 project(cuframes
-    VERSION 0.1.0
+    VERSION 0.3.0
    DESCRIPTION "Zero-copy frame sharing via CUDA IPC"
    LANGUAGES C CXX CUDA
 )
@@ -1,10 +1,15 @@
 # cuframes

+[![build](https://git.goldix.org/gx/cuframes/actions/workflows/build.yml/badge.svg?branch=main)](https://git.goldix.org/gx/cuframes/actions?workflow=build.yml)
+[![release](https://img.shields.io/badge/release-v0.1.0-blue)](https://git.goldix.org/gx/cuframes/releases/tag/v0.1.0)
+[![license](https://img.shields.io/badge/license-LGPL--2.1+-green)](LICENSE)
+
 Zero-copy sharing декодированных видеокадров между процессами через CUDA IPC.

-**Статус:** v0.1 — libcuframes готов, cuframes-rtsp-source готов, e2e-pipeline
-протестирован (4×subscriber × 2000 frames, 0 torn). FFmpeg filter — v0.2.
-**Лицензия:** LGPL-2.1+
+**Статус:** v0.1.0 released — production-deployed на multi-camera CCTV-стeке
+(Frigate + custom C++ processor, оба используют один publisher на одном NVDEC).
+См. [BENCHMARKS.md](BENCHMARKS.md) для measurements, [ROADMAP.md](ROADMAP.md)
+для v0.2 plans.

 ## Минимальные требования

@@ -59,6 +59,36 @@ ETA: 1-2 недели focused работы.
 | Frigate plugin POC (Python side, не FFmpeg) | Альтернативный путь для users которые не хотят патчить FFmpeg |
 | Docker images в public registry | Snapshot CI-built tarballs + multi-arch |

+## Future ideas 💡 (не запланированы, без ETA)
+
+Идеи которые не привязаны к конкретной версии и ждут планирования.
+
+### `gst-cuframes-src` — GStreamer source-element
+
+Аналог FFmpeg-демуксера для GStreamer-стэка. Один publisher cuframes-side → potreбители-pipeline'ы в GStreamer (DeepStream, обычный GStreamer-приложения).
+
+| Зачем | Что |
+|---|---|
+| NVIDIA DeepStream — это GStreamer-native, FFmpeg-демуксер там не работает | `gst-cuframes-src` как `GstBaseSrc`-derived element, выдаёт `GstBuffer` с `GstCudaMemory` (NVMM в Jetson вариант) |
+| GStreamer-приложения (обычный software) | Drop-in source для любой GStreamer pipeline |
+| GStreamer plugin registry | `gst-inspect-1.0 cuframessrc` discoverable |
+
+Open questions: какой memory-type — `memory:CUDAMemory` (mainline) vs `memory:NVMM` (NVIDIA DeepStream-specific). Возможно два варианта/build flags.
+
+### `vf_cuda_grid` — **выделен в отдельный продукт `gx/vf-cuda-grid`** ([repo](https://git.goldix.org/gx/vf-cuda-grid))
+
+FFmpeg filter для GPU-native video grid composition + control-plane sidecar
+(ZeroMQ/MQTT/HTTP/HA Discovery). Дизайн зафиксирован, см.
+[`gx/vf-cuda-grid` docs/design.md](https://git.goldix.org/gx/vf-cuda-grid/src/branch/main/docs/design.md)
+и [epic issue #1](https://git.goldix.org/gx/vf-cuda-grid/issues/1).
+
+Cuframes остаётся frame source provider для vf-cuda-grid в нашей экосистеме
+(но vf-cuda-grid работает и с любым другим CUDA frame source — стандартный FFmpeg).
+
+Закрывает [`gx/cctv#22`](https://git.goldix.org/gx/cctv/issues/22) Phase 4
+(end-to-end GPU pipeline для cctv-processor mosaic composer) после Phase 4 vf-cuda-grid +
+миграция cctv-processor GridComposer → vf_cuda_grid filter.
+
 ## v1.0 — Stable ABI 📋

 - Стабильный wire-protocol (minor versions add fields в reserved space)
@@ -423,3 +423,84 @@ mosaic + RTSP-server. После v1 cuframes:
 3. После Phase 0 — review результатов, корректировка дизайна (если CUDA IPC
   повёл себя не как ожидали)
 4. Phase 1+ по плану
+
+---
+
+# Appendix A — Production deployment notes (post-v0.1.0)
+
+Реальные наблюдения после первого production deployment (Frigate + cctv-processor
+на RTX 5090, 24h+ uptime). Обновляется по мере накопления опыта.
+
+## Что подтвердилось из изначального дизайна
+
+- **CUDA IPC handshake через cudaIpcEventHandle_t работает стабильно** — нет
+  ни одного torn frame за 24+ часов на 2 consumer'ах.
+- **EXTERNAL ownership** (publisher передаёт свои pre-allocated CUDA pointers)
+  необходим для FFmpeg-based publisher — иначе нужен extra cudaMemcpy из FFmpeg's
+  hwframe pool в library-managed pool.
+- **Unix socket handshake** ОК — простой, debug'абельный (`socat` для inspect).
+- **POSIX shm для header + atomic seq counters** — race-free на reader side.
+
+## Что пришлось доработать в v0.1.0 vs initial design
+
+- **CMake install rules** изначально не предусмотрены. Downstream проекты
+  делали `cmake --install` → пустой prefix. Fix: `install(TARGETS ...)` +
+  `install(DIRECTORY include/cuframes ...)`. Лессон — install rules должны
+  быть в day 1.
+- **Variable HINTS в find_library**: пользователи делают install в разные
+  prefix'ы. HINTS для downstream `find_library(cuframes)` должны включать
+  `$PREFIX/lib`, `$PREFIX/lib64`, и `build-dir/libcuframes/` для local-dev.
+
+## Что не учли в дизайне (открытые grабли — см. troubleshooting.md)
+
+### Cross-container CUDA IPC требует **shared pid + ipc namespace**
+
+`cudaIpcOpenEventHandle` validates IPC peer через `/proc/<pid>/...`. Если
+consumer container не в same PID namespace что publisher — fail с
+`invalid device context`.
+
+Это **incompatible** с s6-overlay-based containers (linuxserver.io stack,
+Frigate), требующими PID 1 для self. Workaround: только `ipc:` shared,
+accept race window (works на Frigate в практике потому что подключается
+первым после publisher restart). **Real fix planned v0.2**: socket-based
+context validation вместо `/proc` reliance.
+
+### Publisher-side resize нужен для consumers без cuda-llvm
+
+Большинство downstream FFmpeg builds — без `--enable-cuda-llvm` (на платформах
+с glibc < 2.38 эта опция не собирается, нужен `stdbit.h`). Без cuda-llvm нет
+`scale_cuda` filter. Consumer вынужден CPU-resize либо отключать hwaccel.
+
+**Fix planned v0.2**: publisher принимает `--scale=WxH` и делает GPU resize
+до publish. Consumer получает уже scaled frames, scale_cuda не нужен.
+
+### Encoded packet sharing — отсутствует в v0.1
+
+cuframes v0.1 раздаёт **только decoded** NV12. Для `record` use case
+(`-c:v copy` mux без decode) consumer всё ещё открывает свой RTSP — лимит
+камеры на concurrent streams (4-5 у Dahua) hit'ится.
+
+**v0.2 spec**: parallel encoded-packets ring + `cuframes_packets://`
+demuxer. См. [issue #2](https://git.goldix.org/gx/cuframes/issues/2).
+
+## Production setup (gold path)
+
+```
+                           ┌─► Frigate (FFmpeg cuframes:// demuxer) → detect
+Camera RTSP ─► publisher ──┤
+   (1× NVDEC)               └─► cctv-processor (CuframesSource C++ API) → motion+RTSP-encode→TV
+```
+
+| Метрика | Without cuframes (baseline) | С cuframes v0.1 |
+|---|---|---|
+| NVDEC operations на parking-камеру | 2 (Frigate detect + cctv detect) | **1** (publisher) |
+| VRAM extra cost | 0 (каждый своё) | ~3 MB (ring 6×460KB sub-stream) |
+| RTSP camera load | 2 streams | **1** stream |
+| Uptime (verified) | n/a | 24h+ без drops |
+
+## См. также
+
+- [docs/troubleshooting.md](troubleshooting.md) — конкретные грабли + fixes
+- [BENCHMARKS.md](../BENCHMARKS.md) — измерения
+- [docs/integrations/frigate.md](integrations/frigate.md) — guide для Frigate
+- [ROADMAP.md](../ROADMAP.md) — v0.2/v0.3/v1.0
@@ -1,11 +1,20 @@
 # Integration guide

-Этот guide описывает, как использовать cuframes для устранения дублирующего
-GPU-декодирования между несколькими consumer'ами одного RTSP-потока.
+Хочешь подключить cuframes к своему проекту? Выбери guide по типу integration'а:
+
+## Готовые reference guides
+
+| Тип integration'а | Guide | Reference deployment |
+|---|---|---|
+| **Frigate NVR** (через FFmpeg `cuframes://` demuxer) | [integrations/frigate.md](integrations/frigate.md) | Production: Frigate 0.17.1 + RTX 5090 + Dahua HEVC |
+| **C++ project** (через `CuframesSource` pattern) | [integrations/cctv-cpp.md](integrations/cctv-cpp.md) | Production: [gx/cctv](https://git.goldix.org/gx/cctv) C++17 processor |
+| **Python AI/ML pipeline** (через ctypes wrapper) | [examples/python-consumer/](../examples/python-consumer/) | Skeleton ready; v0.3 даст native bindings |
+| **FFmpeg-based custom tool** (своя сборка ffmpeg) | [filter/README.md](../filter/README.md) | Out-of-tree patch + build instructions |

 ## Целевой сценарий (motivation)

-В типичной CCTV-системе один и тот же RTSP-stream декодируется несколько раз:
+В типичной CCTV / video-analytics системе один и тот же RTSP-поток
+декодируется **несколько раз**:

 ```
   Камера ──► RTSP ──► Frigate          (decode #1: detection + recording)
@@ -13,13 +22,14 @@ GPU-декодирования между несколькими consumer'ами
                  ─►   AI-скрипт        (decode #3: классификация / OCR)
 ```

-На 16 камер × 25 fps × 3 consumer'а = 1200 NVDEC-операций/сек. RTX 5090 имеет
-~3 NVDEC-движка, но шина PCIe и memory bandwidth становятся узким местом.
+На 16 камер × 25 fps × 3 consumer'а = **1200 NVDEC operations/sec**. RTX 5090
+имеет ~3 NVDEC-движка с capacity ~50 FHD25 streams → загрузка близка к лимиту,
+плюс tax на PCIe bandwidth и memory.

 С cuframes:

 ```
-   Камера ──► cuframes-rtsp-source ──► CUDA frame в /dev/shm + cudaIpcEvent
+   Камера ──► cuframes-rtsp-source ──► CUDA frame в VRAM + IPC handles
                                              │
                                              ├──► Frigate          (zero-copy)
                                              ├──► mosaic-сервер    (zero-copy)
@@ -27,242 +37,36 @@ GPU-декодирования между несколькими consumer'ами
 ```

 Decode выполняется **один раз** на источник, потребители получают тот же CUDA
-device pointer без копий.
+device pointer без копий. **3× меньше NVDEC operations** на том же setup'е.

-## Текущие limitations v0.1
+## Текущие ограничения (v0.1)

- **Frigate** (по состоянию на 0.17) **не имеет** plugin-точки для приёма
-  готовых CUDA-frames. Чтобы убрать Frigate decode полностью, нужен:
-  - либо FFmpeg-filter `vf_cuda_ipc_input` (planned для cuframes v0.2 — требует
-    patch FFmpeg upstream и пересборку Frigate's bundled ffmpeg),
-  - либо Frigate-plugin (требует upstream работы с командой Frigate).
- В v0.1 практическое улучшение: **исключить decode для всех custom consumer'ов
-  кроме Frigate** (то есть cctv-processor, AI-скрипты — на cuframes; Frigate
-  остаётся как есть, со своим decode).
+- **Decoded frame sharing only** (не encoded). Для `record` path в Frigate
+  (mux без decode) consumer всё ещё открывает свой RTSP — это решит **v0.2
+  encoded packet sharing** (см. [issue #2](https://git.goldix.org/gx/cuframes/issues/2)).

-Это уже даёт значительную экономию: было 1×Frigate + N×consumer decode'ов,
-стало 1×Frigate + 1×cuframes-rtsp-source (один на все consumer'ы).
+- **NV12 frame format only**. Other formats (YUV420P, RGB) — v0.2.

-## Сценарий 1: cuframes-rtsp-source + cctv-processor (FRIGATE остаётся)
+- **GPU → CPU copy** в FFmpeg demuxer'е (`cudaMemcpy2DAsync`). Zero-copy через
+  `AVHWFramesContext` — v0.2.

-### docker-compose.yml
+- **Cross-container CUDA IPC** требует shared `ipc + pid` namespace. Если
+  consumer использует s6-overlay (как Frigate) — pid не shareable, нужен
+  workaround (см. [integrations/frigate.md](integrations/frigate.md)
+  troubleshooting).

-```yaml
-services:
-  # Один источник на камеру — публикует декодированный поток через cuframes IPC
-  cuframes-cam-parking:
-    image: gx/cuframes-rtsp-source:0.1
-    restart: unless-stopped
-    runtime: nvidia
-    environment:
-      NVIDIA_VISIBLE_DEVICES: all
-      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
-    # CRITICAL: --ipc=shareable для cross-container CUDA IPC
-    ipc: shareable
-    shm_size: 1g
-    volumes:
-      - cuframes_sock:/run/cuframes
-    command:
-      - --rtsp=rtsp://admin:${CAM_PASS}@192.168.88.98:554/cam/realmonitor?channel=1&subtype=0
-      - --key=cam-parking
-      - --ring=6
-      - --realtime  # не нужен для RTSP (real-time источник), оставлен для file://
+- **Только Linux + NVIDIA GPU** compute capability ≥ 7.5 (Turing+).

-  # Frigate (как и был — со своим decode на main+sub streams)
-  frigate:
-    image: ghcr.io/blakeblackshear/frigate:stable-tensorrt
-    # ... как обычно
+## Production reference deployments

-  # cctv-processor — подписывается на cuframes (без отдельного RTSP decode)
-  cctv-backend:
-    image: gx/cctv-processor:cuda
-    restart: unless-stopped
-    runtime: nvidia
-    # CRITICAL: shared IPC + PID namespace с publisher'ом (см. ниже)
-    ipc: container:cuframes-cam-parking
-    pid: container:cuframes-cam-parking
-    volumes:
-      - cuframes_sock:/run/cuframes:ro
-    environment:
-      # cuframes-keys для backend'а:
-      CCTV_SOURCES: cuframes:cam-parking,cuframes:cam-front-gate,...
+| Setup | Версия | Где смотреть |
+|---|---|---|
+| 1 publisher (1× NVDEC) → Frigate (detect) + cctv-backend (motion+grid→RTSP→TV) | v0.1.0 | [BENCHMARKS.md](../BENCHMARKS.md), [integrations/frigate.md](integrations/frigate.md) |

-volumes:
-  cuframes_sock:
-```
+## Roadmap для v0.2+

-**Важно — оба флага обязательны** для cross-container CUDA IPC:
+Полный roadmap — [ROADMAP.md](../ROADMAP.md). Highlights:

-| Флаг | Зачем |
-|---|---|
-| `ipc: container:<publisher>` | shared `/dev/shm` (нужен для `shm_open` под header/sockets) |
-| `pid: container:<publisher>` | CUDA driver валидирует IPC peer через `/proc/<pid>/...`; без этого `cudaIpcOpenEventHandle` падает с `invalid device context` |
-
-Альтернативы:
- Запускать consumer внутри того же container'а через `docker exec` (наследует все namespaces) — удобно для отладки.
- `--ipc=host --pid=host` — убирает namespacing вообще, но ослабляет изоляцию (не рекомендуется в production).
-
-### Изменения в cctv-processor
-
-Нужно добавить новый Source-тип (рядом с RtspSource) — `CuframesSource`:
-
-```cpp
-// cpp/apps/cctv-processor/src/sources/cuframes_source.hpp
-#include <cuframes/cuframes.hpp>
-
-class CuframesSource : public IVideoSource {
-public:
-    CuframesSource(const std::string &key) : key_(key) {
-        cuframes::SubscriberOptions opt;
-        opt.key = key;
-        opt.consumer_name = "cctv-processor";
-        opt.mode = CUFRAMES_MODE_NEWEST_ONLY;
-        sub_ = std::make_unique<cuframes::Subscriber>(opt);
-        cudaStreamCreate(&stream_);
-    }
-
-    // Вызывается processing-loop'ом
-    std::optional<GpuFrame> nextFrame() override {
-        auto f = sub_->next(stream_, 100);  // 100ms timeout
-        if (!f) return std::nullopt;
-        // cudaStreamWaitEvent уже сделан внутри next() — frame готов на stream_
-        return GpuFrame{
-            .cuda_ptr = f->cuda_ptr(),
-            .width = f->width(),
-            .height = f->height(),
-            .pitch_y = f->pitch_y(),
-            .pitch_uv = f->pitch_uv(),
-            .seq = f->seq(),
-            .pts_ns = f->pts_ns(),
-            .stream = stream_,
-            ._release = std::move(f),  // RAII release при destroy
-        };
-    }
-
-private:
-    std::string key_;
-    std::unique_ptr<cuframes::Subscriber> sub_;
-    cudaStream_t stream_;
-};
-```
-
-Конфиг `cameras.json` — добавить альтернативный source-тип:
-
-```jsonc
-{
-  "cameras": [
-    {
-      "id": "parking",
-      "source_type": "cuframes",          // вместо "rtsp"
-      "cuframes_key": "cam-parking",
-      // rtsp_url больше не нужен — он используется cuframes-rtsp-source'ом
-    }
-  ]
-}
-```
-
-## Сценарий 2: AI-скрипт на Python (subscriber)
-
-Python-bindings — в Phase 3 cuframes. Сейчас простой workaround через
-ctypes:
-
-```python
-import ctypes
-lib = ctypes.CDLL("libcuframes.so")
-# ... wrap нужные функции — см. include/cuframes/cuframes.h
-```
-
-Или: writer simple C-обёртку, которая принимает callback и публикует
-данные через ZMQ / shared memory в python-process.
-
-## Сценарий 3: Замена Frigate decode (v0.2+)
-
-Целевой сценарий — Frigate тоже подписан на cuframes. Реализуется через
-один из двух путей:
-
-### Путь A: FFmpeg filter
-
-Добавить out-of-tree filter `vf_cuda_ipc_input` который читает кадр из
-cuframes ring и эмитит AVFrame в pipeline. Frigate использует ffmpeg для
-RTSP/decode — заменяем "RTSP→decode→detect" на
-"cuframes_ipc_input→detect" (без decode'а вообще).
-
-Требования:
- Patch ffmpeg sources (libavfilter/vf_cuda_ipc_input.c + Makefile)
- Сборка кастомного Frigate-образа с patched ffmpeg
- Тестирование на совместимость с Frigate's pipeline assumptions
-
-### Путь B: Frigate plugin
-
-Engage с upstream Frigate чтобы добавить custom Source-type ("cuframes://").
-Это требует Python-API изменений в Frigate's source layer.
-
-## Verification checklist
-
-После настройки убедитесь:
-
-```bash
-# 1. Publisher запущен и socket существует
-ls -la /run/cuframes/cam-parking.sock
-ls -la /dev/shm/cuframes-cam-parking
-
-# 2. Контейнеры в одном IPC и PID namespace
-docker inspect cuframes-cam-parking cctv-backend \
-  -f '{{.Name}} ipc={{.HostConfig.IpcMode}} pid={{.HostConfig.PidMode}}'
-# Publisher: ipc=shareable pid=(default)
-# Consumer:  ipc=container:cuframes-cam-parking pid=container:cuframes-cam-parking
-
-# 3. Subscriber connect успешен
-docker exec cctv-backend /usr/local/bin/sub_count --key cam-parking --max-frames 10
-# Ожидаем:
-#   [sub_count] connected to 'cuframes-cam-parking'
-#   [sub_count] received=10 gaps=0 elapsed=0.4s avg_fps=25
-
-# 4. NVDEC utilization — должно быть N decodes, а не N*M
-nvidia-smi dmon -s u
-# Колонка %dec должна показать decode-нагрузку одного instance на камеру
-```
-
-## Troubleshooting
-
-### `Subscriber::create: timeout`
-Subscriber не нашёл publisher. Причины:
- Publisher не запущен или crashed — проверь `docker logs cuframes-cam-parking`
- Socket-файл не volumes'нут в consumer-контейнер — добавь `volumes:
-  - cuframes_sock:/run/cuframes:ro` в consumer'е
- IPC namespace не совпадает — см. checklist пункт 2
-
-### `cudaIpcOpenEventHandle: invalid device context`
-Проявляется в **отдельном** consumer-container'е после успешного handshake (socket
-открыт, header валиден, но open event handle не проходит).
-
-Причина: CUDA driver валидирует sender'а IPC peer'а через `/proc`. Если PID
-namespace не совпадает, sender невидим — context считается невалидным.
-
-Fix: добавить `pid: container:<publisher>` в consumer's compose service (рядом
-с `ipc: container:<publisher>`). Проверено на CUDA 13.0 + driver 555+.
-
-### `cudaIpcOpenMemHandle returned 'invalid device pointer'`
- Контейнеры в РАЗНЫХ ipc namespace — должны быть в одном (через
-  `ipc: container:<publisher>` или общий `ipc: shareable`)
- Subscriber работает на другом CUDA device — `--cuda-device` должен совпадать
-  у publisher и subscriber (одно и то же физическое GPU)
-
-### Высокая latency (>50ms tail)
- Subscriber slow — frames копятся в ring, по политике DROP_OLDEST они
-  пропускаются. Используй `CUFRAMES_MODE_NEWEST_ONLY` (default) — это нормально
-  для real-time системы.
- При STRICT_ORDER + STRICT_WAIT — slow consumer блокирует publisher. Не
-  рекомендуется для CCTV.
-
-### Frigate показывает чёрный экран после интеграции
- Frigate не подключён к cuframes (v0.1 — это not yet supported). В v0.1
-  Frigate должен оставаться на своём RTSP decode (см. Сценарий 1).
-
-## Roadmap
-
- **v0.1** (текущая): standalone publisher/subscriber, C/C++ API, examples.
- **v0.2**: FFmpeg filter `vf_cuda_ipc_input` (out-of-tree), Python bindings.
- **v0.3**: NVENC-bridge для re-encode подписчиков, Frigate plugin
-  proof-of-concept.
- **v1.0**: stable ABI, multi-GPU, documented Frigate integration.
+- **v0.2**: encoded packet sharing (Frigate record без второго RTSP), FFmpeg upstream PR, publisher-side resize для устранения scale_cuda dependency
+- **v0.3**: pybind11 Python bindings, Jetson/arm64 support
+- **v1.0**: stable ABI, multi-GPU, env-based credentials
@@ -0,0 +1,309 @@
+# C++ project integration (cctv-processor pattern)
+
+Reference guide на основе реального production deployment
+([gx/cctv](https://git.goldix.org/gx/cctv) — C++17 video processor).
+
+## Use case
+
+Custom video pipeline (motion detection, mosaic compose, encode-out, snapshots),
+получает кадры с N камер и выполняет per-frame processing. Без cuframes:
+один RTSP+NVDEC на каждую камеру **внутри** processor + дублирующий decode
+если Frigate/AI script тоже подключены к той же камере.
+
+С cuframes: processor подписывается на published frames, **никакого RTSP / NVDEC**
+у него — все консьюмеры используют один decode от publisher'а.
+
+## Архитектурный паттерн
+
+Выделить **interface** `IFrameSource` чтобы pipeline не зависел от конкретного
+источника (RTSP vs cuframes vs тестовый file).
+
+```cpp
+// include/sources/IFrameSource.h
+namespace cctv::sources {
+
+enum class ConnectionState {
+    DISCONNECTED, CONNECTING, CONNECTED, RECONNECTING, ERROR
+};
+
+struct StreamInfo {
+    int width = 0;
+    int height = 0;
+    double fps = 0.0;
+    std::string codec_name;
+    int64_t bitrate = 0;
+};
+
+class IFrameSource {
+public:
+    using FrameCallback = std::function<void(const cv::Mat& frame, int64_t ts_ms)>;
+    using StateCallback = std::function<void(ConnectionState, const std::string&)>;
+
+    virtual ~IFrameSource() = default;
+    virtual bool connect(const std::string& url) = 0;
+    virtual void disconnect() = 0;
+    virtual bool isConnected() const = 0;
+    virtual void setFrameCallback(FrameCallback) = 0;
+    virtual void setStateCallback(StateCallback) = 0;
+    virtual void setReconnectEnabled(bool) = 0;
+    virtual StreamInfo getStreamInfo() const = 0;
+    virtual ConnectionState getState() const = 0;
+    virtual std::string getLastError() const = 0;
+    virtual uint64_t getFramesReceived() const = 0;
+    virtual uint64_t getFramesDropped() const = 0;
+    virtual double getCurrentFPS() const = 0;
+};
+
+}  // namespace cctv::sources
+```
+
+`RTSPClient` (legacy) и `CuframesSource` оба implement `IFrameSource`. Pipeline
+работает с `unique_ptr<IFrameSource>` — code не знает, RTSP это или cuframes.
+
+## CuframesSource — реализация
+
+```cpp
+// include/sources/CuframesSource.h
+#include "sources/IFrameSource.h"
+
+// Forward-declare — не утекают в header
+struct cuframes_subscriber;
+typedef struct cuframes_subscriber cuframes_subscriber_t;
+
+namespace cctv::sources {
+
+class CuframesSource : public IFrameSource {
+public:
+    CuframesSource();
+    ~CuframesSource() override;
+
+    // IFrameSource: URL для cuframes — это просто `key` (либо "cuframes://<key>")
+    bool connect(const std::string& url) override;
+    void disconnect() override;
+    // ... остальные методы (см. полный файл в gx/cctv repo)
+
+    void setCudaDevice(int device);
+    void setReconnectInterval(int seconds);
+
+private:
+    void workerThread();
+    bool openSubscriber();
+    void closeSubscriber();
+
+    std::string m_key;
+    int m_cudaDevice = 0;
+    cuframes_subscriber_t* m_sub = nullptr;
+    void* m_cudaStream = nullptr;        // cudaStream_t, opaque
+    void* m_hostBuffer = nullptr;        // pinned host buffer для NV12
+    size_t m_hostBufferSize = 0;
+    std::thread m_thread;
+    std::atomic<bool> m_shouldStop{false};
+    // ... callbacks, state, stats
+};
+
+}  // namespace cctv::sources
+```
+
+### Worker thread (core)
+
+```cpp
+void CuframesSource::workerThread() {
+    while (!m_shouldStop.load()) {
+        if (!m_sub) {
+            if (!openSubscriber()) {
+                changeState(ConnectionState::RECONNECTING, m_lastError);
+                if (!m_reconnectEnabled) return;
+                sleep_for(seconds(m_reconnectInterval));
+                continue;
+            }
+            changeState(ConnectionState::CONNECTED, "");
+        }
+
+        cuframes_frame_t* frame = nullptr;
+        int rc = cuframes_subscriber_next(m_sub, m_cudaStream, &frame, 200);
+
+        if (rc == CUFRAMES_ERR_TIMEOUT) continue;
+        if (rc == CUFRAMES_ERR_DISCONNECTED) {
+            closeSubscriber();
+            changeState(ConnectionState::RECONNECTING, "publisher disconnected");
+            continue;
+        }
+        if (rc != CUFRAMES_OK || !frame) {
+            LOG_ERROR("cuframes next: " + std::string(cuframes_strerror(rc)));
+            closeSubscriber();
+            continue;
+        }
+
+        // Frame metadata
+        int32_t w, h;
+        cuframes_frame_size(frame, &w, &h);
+        const int32_t pitch_y  = cuframes_frame_pitch_y(frame);
+        const int32_t pitch_uv = cuframes_frame_pitch_uv(frame);
+        const int64_t pts_ns   = cuframes_frame_pts_ns(frame);
+
+        // Ensure host buffer big enough
+        const size_t need = (size_t)w * h * 3 / 2;  // NV12 packed
+        if (need > m_hostBufferSize) {
+            cudaFreeHost(m_hostBuffer);
+            cudaMallocHost(&m_hostBuffer, need);
+            m_hostBufferSize = need;
+        }
+
+        // Copy GPU NV12 → host NV12 (Y plane + UV plane)
+        uint8_t* cu = (uint8_t*)cuframes_frame_cuda_ptr(frame);
+        cudaMemcpy2DAsync(m_hostBuffer, w, cu, pitch_y,
+                          w, h, cudaMemcpyDeviceToHost, m_cudaStream);
+        cudaMemcpy2DAsync((uint8_t*)m_hostBuffer + (size_t)w*h, w,
+                          cu + (size_t)pitch_y*h, pitch_uv,
+                          w, h/2, cudaMemcpyDeviceToHost, m_cudaStream);
+        cudaStreamSynchronize(m_cudaStream);
+
+        // Release frame BEFORE downstream processing — publisher может переиспользовать slot
+        cuframes_subscriber_release(m_sub, frame);
+
+        // NV12 → BGR (CPU) — downstream pipeline ожидает cv::Mat BGR
+        cv::Mat nv12(h * 3 / 2, w, CV_8UC1, m_hostBuffer);
+        cv::Mat bgr;
+        cv::cvtColor(nv12, bgr, cv::COLOR_YUV2BGR_NV12);
+
+        // Доставка через callback
+        if (m_frameCallback) m_frameCallback(bgr, pts_ns / 1000000);
+    }
+    closeSubscriber();
+}
+```
+
+`cudaMemcpy → CPU → cv::cvtColor` это v0.1 path. **Zero-copy** через
+`AVHWFramesContext` / OpenCV cv::cuda::GpuMat — planned v0.2.
+
+## Factory pattern (per-camera)
+
+```cpp
+// В StreamProcessor::initializeComponents()
+for (const auto& camera : cameras) {
+    if (!camera.enabled) continue;
+
+    std::unique_ptr<sources::IFrameSource> source;
+    if (camera.source_type == "cuframes") {
+        source = std::make_unique<sources::CuframesSource>();
+    } else {
+        source = std::make_unique<rtsp::RTSPClient>();  // legacy RTSP
+    }
+
+    source->setFrameCallback([this, id = camera.id](const cv::Mat& frame, int64_t ts) {
+        m_videoProcessor->processFrame(id, frame);
+    });
+    source->setStateCallback([this, id = camera.id](auto state, const std::string& msg) {
+        // logging, alerting, watchdog
+    });
+    source->setReconnectEnabled(true);
+
+    m_frameSources[camera.id] = std::move(source);
+}
+```
+
+В `start()` — отдельный цикл:
+
+```cpp
+for (const auto& camera : cameras) {
+    if (!camera.enabled) continue;
+    auto& src = m_frameSources[camera.id];
+    const std::string url = (camera.source_type == "cuframes")
+        ? camera.cuframes_key
+        : camera.rtsp_url;
+    src->connect(url);
+}
+```
+
+## CMake integration
+
+```cmake
+# cmake/Dependencies.cmake
+if(ENABLE_CUDA AND CUDA_AVAILABLE)
+    find_path(CUFRAMES_INCLUDE_DIR cuframes/cuframes.h
+        HINTS ${CUFRAMES_ROOT}/include /usr/local/include /usr/include
+    )
+    find_library(CUFRAMES_LIBRARY cuframes
+        HINTS ${CUFRAMES_ROOT}/lib ${CUFRAMES_ROOT}/lib64
+              /usr/local/lib /usr/lib
+    )
+    if(CUFRAMES_INCLUDE_DIR AND CUFRAMES_LIBRARY)
+        set(CUFRAMES_FOUND TRUE)
+        find_package(CUDAToolkit REQUIRED)
+        message(STATUS "cuframes: FOUND (${CUFRAMES_LIBRARY})")
+    else()
+        message(STATUS "cuframes: NOT FOUND (camera source_type=cuframes недоступен)")
+    endif()
+endif()
+```
+
+```cmake
+# apps/your-processor/CMakeLists.txt
+if(CUFRAMES_FOUND)
+    target_include_directories(your-processor PRIVATE ${CUFRAMES_INCLUDE_DIR})
+    target_link_libraries(your-processor PRIVATE ${CUFRAMES_LIBRARY} CUDA::cudart)
+    target_compile_definitions(your-processor PRIVATE CCTV_HAVE_CUFRAMES=1)
+endif()
+```
+
+`CuframesSource.cpp` оборачивается в `#ifdef CCTV_HAVE_CUFRAMES` — без cuframes
+в системе фабрика возвращает error при `source_type == "cuframes"`, остальное
+компилируется как обычно.
+
+## Config
+
+`cameras.json` extension:
+
+```json
+{
+  "cameras": [
+    {
+      "id": 1,
+      "name": "Парковка через cuframes",
+      "source_type": "cuframes",
+      "cuframes_key": "cam-parking",
+      "rtsp_url": "",
+      "enabled": true,
+      "motion_detection": { "enabled": false, ... }
+    },
+    {
+      "id": 2,
+      "name": "Камера на RTSP",
+      "rtsp_url": "rtsp://admin:pw@cam-ip:554/stream",
+      "enabled": true
+    }
+  ]
+}
+```
+
+## Runtime requirements
+
+Consumer container/process должен:
+1. Иметь доступ к `/run/cuframes` (volume mount от publisher'а).
+2. Быть в **same** IPC namespace (для `/dev/shm` shared) — `ipc: container:<publisher>`.
+3. Быть в **same** PID namespace (для CUDA driver IPC validation) — `pid: container:<publisher>` (если consumer не имеет PID-1-strict init типа s6-overlay).
+4. Иметь NVIDIA runtime — `runtime: nvidia` в compose.
+5. Запускаться с правом доступа к socket (по умолчанию root) — `user: root` в compose.
+
+Пример compose service:
+
+```yaml
+your-cctv-backend:
+  image: your-image:cuda
+  runtime: nvidia
+  user: root  # socket в publisher container root-owned
+  ipc: "container:cuframes-pub-parking"
+  pid: "container:cuframes-pub-parking"  # если ваш image не использует s6
+  environment:
+    NVIDIA_VISIBLE_DEVICES: all
+    NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+  volumes:
+    - cuframes_sock:/run/cuframes:ro
+```
+
+## См. также
+
+- [filter/README.md](../../filter/README.md) — FFmpeg demuxer (если ваш processor построен на FFmpeg)
+- [docs/integrations/frigate.md](frigate.md) — Frigate-specific guide
+- [docs/architecture.md](../architecture.md) — внутренности CUDA IPC
+- [Полный код CuframesSource](https://git.goldix.org/gx/cctv/src/branch/enterprise/develop/cpp/apps/cctv-processor/src/sources/CuframesSource.cpp) — реальный production-tested файл
@@ -0,0 +1,364 @@
+# Frigate integration
+
+Полный production-tested guide для интеграции cuframes с
+[Frigate NVR](https://github.com/blakeblackshear/frigate). На основе реального
+deployment (Frigate 0.17.1-tensorrt + RTX 5090 + Dahua HEVC камеры).
+
+## Что вы получаете
+
+- **Один NVDEC decode на камеру** вместо одного у Frigate + одного у каждого
+  другого consumer'а (cctv-processor, AI-скрипт, mosaic-сервер).
+- Frigate видит decoded frames через **обычный FFmpeg URL** — никакого fork'а
+  Frigate-кода. Frigate сам не подозревает что под капотом cuframes.
+
+## Что вы НЕ получаете в v0.1
+
+- **Record path** (`-c:v copy` для архива) — этот path в Frigate всё ещё через
+  свой отдельный RTSP. v0.2 cuframes решит это через encoded packet sharing
+  (см. [issue #2](https://git.goldix.org/gx/cuframes/issues/2)).
+- Hwaccel CUDA filters для detect resize (`scale_cuda`) — наш minimal FFmpeg
+  собран без `--enable-cuda-llvm` (не работает на glibc < 2.38 что у Debian 12,
+  на котором Frigate base). Workaround: `hwaccel_args: []` в config → CPU
+  scale (cost ~5-10% CPU на FHD25).
+
+## Архитектура
+
+```
+Camera RTSP ──► cuframes-rtsp-source ──► [NVDEC ─► NV12 in CUDA IPC]
+                                              │
+                                              ├──► Frigate (ffmpeg -f cuframes) → detect
+                                              ├──► cctv-processor (CuframesSource) → motion+mosaic
+                                              └──► AI-script (Python ctypes) → inference
+```
+
+## Требования
+
+| | Минимум | Note |
+|---|---|---|
+| NVIDIA driver | 555+ | для CUDA 12 runtime |
+| CUDA Toolkit (для build patched FFmpeg) | 12.4+ | host или builder container |
+| GPU compute capability | ≥ 7.5 | требование CUDA IPC |
+| OS на target (Frigate runtime) | Debian 12 bookworm | glibc 2.36 — это база Frigate `stable-tensorrt` |
+| OS на builder | Ubuntu 22.04 (glibc 2.35) | forward-compat с Debian 12 |
+| docker buildx | latest | для multi-stage build |
+
+## Шаг 1 — Build patched Frigate image
+
+Cuframes integration требует patched FFmpeg внутри Frigate с `cuframes://`
+demuxer. Самый простой путь — собрать overlay image поверх existing Frigate.
+
+### 1.1. Минимальный Dockerfile (Debian 12 builder + custom FFmpeg)
+
+```dockerfile
+# Build patched FFmpeg на Debian 12 (glibc-совместимо с Frigate runtime)
+FROM debian:bookworm AS builder
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git nasm pkg-config ca-certificates wget patch ninja-build \
+    libssl-dev libx264-dev libx265-dev libnuma-dev zlib1g-dev \
+    libfreetype-dev libfribidi-dev libharfbuzz-dev libfontconfig-dev \
+    libvpx-dev libopus-dev libmp3lame-dev libvorbis-dev libtheora-dev libwebp-dev \
+    libaom-dev libdav1d-dev libsvtav1enc-dev \
+    libssh-dev librist-dev libsrt-openssl-dev \
+    libdrm-dev libva-dev libxcb1-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# CUDA toolkit 12.x
+RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb \
+ && dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb \
+ && apt-get update && apt-get install -y --no-install-recommends cuda-toolkit-12-6 \
+ && rm -rf /var/lib/apt/lists/*
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+# nv-codec-headers (для FFmpeg ffnvcodec/nvenc/nvdec)
+RUN git clone --depth 1 --branch n12.2.72.0 https://github.com/FFmpeg/nv-codec-headers.git /tmp/nvc \
+ && make -C /tmp/nvc install && rm -rf /tmp/nvc
+
+# Build libcuframes (static install в /opt/cuframes)
+RUN git clone --depth 1 https://git.goldix.org/gx/cuframes.git /src/cuframes \
+ && cmake -B /src/cuframes/build -S /src/cuframes -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF \
+        -DBUILD_EXAMPLES=OFF -DBUILD_TOOLS=OFF \
+ && cmake --build /src/cuframes/build -j"$(nproc)" \
+ && cmake --install /src/cuframes/build --prefix /opt/cuframes
+
+# Clone patched FFmpeg fork (либо upstream + apply patch — см. filter/README.md)
+RUN git clone --depth 1 --branch n7.1-cuframes \
+        https://git.goldix.org/gx/ffmpeg-patched.git /src/ffmpeg
+
+# Configure (minimal-but-functional для Frigate)
+RUN cd /src/ffmpeg && ./configure \
+        --prefix=/opt/ffmpeg \
+        --enable-gpl --enable-version3 --enable-nonfree \
+        --enable-libcuframes \
+        --enable-libx264 --enable-libx265 \
+        --enable-libvpx --enable-libopus --enable-libmp3lame \
+        --enable-libvorbis --enable-libtheora --enable-libwebp \
+        --enable-libaom --enable-libdav1d --enable-libsvtav1 \
+        --enable-libfreetype --enable-libfribidi --enable-libharfbuzz \
+        --enable-libssh --enable-librist --enable-libsrt \
+        --enable-openssl \
+        --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec \
+        --extra-cflags="-I/opt/cuframes/include -I/usr/local/cuda/include" \
+        --extra-ldflags="-L/opt/cuframes/lib -L/usr/local/cuda/lib64" \
+        --extra-libs="-lcudart -lpthread -lrt -lm" \
+        --disable-doc --disable-htmlpages --disable-manpages
+RUN cd /src/ffmpeg && make -j"$(nproc)" && make install
+
+# ─── Runtime: Frigate + наши binaries поверх ──────────────────────────
+FROM ghcr.io/blakeblackshear/frigate:stable-tensorrt
+
+# Missing dynamic .so которые требует наш patched ffmpeg (Frigate image их не имеет —
+# bundled статически собран без них в DT_NEEDED)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libharfbuzz0b libfribidi0 librist4 libsrt1.5-openssl libssh-4 \
+        libvpx7 libwebpmux3 libwebp7 libdav1d6 libaom3 libmp3lame0 \
+        libsvtav1enc1 libtheora0 libvorbis0a libvorbisenc2 \
+        libx264-164 libx265-199 libopus0 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Replace bundled ffmpeg (оригинал backup'нем под .orig)
+RUN cp /usr/lib/ffmpeg/7.0/bin/ffmpeg  /usr/lib/ffmpeg/7.0/bin/ffmpeg.orig \
+ && cp /usr/lib/ffmpeg/7.0/bin/ffprobe /usr/lib/ffmpeg/7.0/bin/ffprobe.orig
+COPY --from=builder /opt/ffmpeg/bin/ffmpeg  /usr/lib/ffmpeg/7.0/bin/ffmpeg
+COPY --from=builder /opt/ffmpeg/bin/ffprobe /usr/lib/ffmpeg/7.0/bin/ffprobe
+COPY --from=builder /opt/cuframes/lib/libcuframes.so.0.1.0 /usr/local/lib/
+RUN cd /usr/local/lib && ln -sf libcuframes.so.0.1.0 libcuframes.so.0 \
+ && ln -sf libcuframes.so.0 libcuframes.so && ldconfig
+
+# Build-time smoke: ldd resolved + cuframes demuxer registered
+RUN ldd /usr/lib/ffmpeg/7.0/bin/ffmpeg | grep -q "not found" && exit 1 || true
+RUN /usr/lib/ffmpeg/7.0/bin/ffmpeg -hide_banner -formats | grep -q cuframes \
+ && echo "OK: cuframes demuxer registered in Frigate image"
+```
+
+Build:
+```bash
+docker build -t local/frigate-cuframes:latest -f Dockerfile.frigate .
+```
+
+Размер ~10 GB (наследует Frigate `stable-tensorrt` ~9 GB).
+
+## Шаг 2 — docker-compose: publisher + Frigate
+
+```yaml
+services:
+  # Один publisher на камеру — единственный source RTSP, делает 1× NVDEC.
+  cuframes-pub-parking:
+    image: git.goldix.org/gx/cuframes:0.1   # либо local build из filter/Dockerfile.runtime
+    container_name: cuframes-pub-parking
+    restart: unless-stopped
+    runtime: nvidia
+    # CRITICAL: ipc=shareable — Frigate и другие consumers подсоединяются через
+    # ipc: container:cuframes-pub-parking
+    ipc: shareable
+    shm_size: 256m
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    volumes:
+      - cuframes_sock:/run/cuframes
+    command:
+      - /usr/local/bin/cuframes-rtsp-source
+      - --rtsp
+      - "rtsp://admin:${CAM_PASS}@cam-parking-ip:554/cam/realmonitor?channel=1&subtype=1"
+      - --key
+      - cam-parking
+      - --ring
+      - "6"
+      - --verbose
+
+  frigate:
+    image: local/frigate-cuframes:latest
+    container_name: frigate
+    restart: unless-stopped
+    depends_on:
+      cuframes-pub-parking:
+        condition: service_started
+    runtime: nvidia
+    privileged: true
+    shm_size: 512m
+    # CUDA IPC c publisher'ом: shared /dev/shm
+    # WARN: pid намерено НЕ share'ится — Frigate использует s6-overlay,
+    # которое требует PID 1 в своём namespace.
+    ipc: "container:cuframes-pub-parking"
+    environment:
+      FRIGATE_RTSP_PASSWORD: "${FRIGATE_RTSP_PASSWORD}"
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    ports:
+      - "5000:5000"
+      - "8971:8971"
+    volumes:
+      - cuframes_sock:/run/cuframes:ro
+      - ./config/config.yml:/config/config.yml:ro
+      - /home/user/frigate-media:/media/frigate
+      # ... остальные volumes как обычно
+
+volumes:
+  cuframes_sock:
+```
+
+## Шаг 3 — Frigate config.yml
+
+Ключевые отличия от стандартного config:
+
+```yaml
+ffmpeg:
+  # ВАЖНО: hwaccel cuda отключаем (наш ffmpeg без cuda-llvm → нет scale_cuda).
+  # Detect-path использует CPU scale, но decode уже done у publisher'а.
+  hwaccel_args: []
+  output_args:
+    record: preset-record-generic-audio-aac
+
+cameras:
+  parking_overview:
+    enabled: true
+    ffmpeg:
+      inputs:
+        # main (full-res) — только запись в архив через прямой RTSP
+        # (decode у Frigate НЕ происходит — это `-c:v copy` мux)
+        - path: rtsp://admin:${FRIGATE_RTSP_PASSWORD}@cam-parking-ip:554/cam/realmonitor?channel=1&subtype=0
+          roles: [record]
+
+        # sub-stream → через cuframes (decoded у publisher'а, без второго NVDEC у Frigate)
+        - path: cuframes://cam-parking
+          input_args: -f cuframes
+          roles: [detect]
+    detect:
+      width: 640
+      height: 480
+      fps: 5
+```
+
+После v0.2 cuframes (encoded packet sharing) record-path тоже мoжет
+переключиться на `cuframes_packets://cam-parking` — тогда **никакого RTSP в
+Frigate config'е вообще**.
+
+## Шаг 4 — Run + verify
+
+```bash
+docker compose up -d
+docker logs -f frigate
+```
+
+Что искать в logs:
+- `[INFO] Camera processor started for parking_overview` — normal startup
+- НЕТ `[ERROR] Ffmpeg process crashed` — если есть, посмотри
+  [Troubleshooting](#troubleshooting)
+- В `nvidia-smi dmon -s u` колонка `%dec` должна показывать ~1-2% на одну
+  камеру (это publisher), Frigate сам не decode'ит cuframes input
+
+```bash
+# Проверить что Frigate реально читает cuframes:
+docker exec frigate ps -ef | grep ffmpeg | grep cuframes
+# Должна быть линия вида:
+#   ffmpeg ... -f cuframes -i cuframes://cam-parking -r 5 -vf fps=5,scale=640:480 ...
+```
+
+## Troubleshooting
+
+### `s6-overlay-suexec: fatal: can only run as pid 1`
+
+Появляется если попытались добавить `pid: container:cuframes-pub-parking` в
+Frigate service. Frigate's s6-overlay strict требует PID 1.
+
+**Fix**: убрать `pid:` из compose. Если только `ipc:` shared — большинство
+случаев работают (Frigate подсоединяется первым и его CUDA context служит
+для последующих).
+
+**Альтернатива**: запустить Frigate с собственным namespace но дублировать
+publisher socket через bind-mount. Frigate сам управляется first CUDA context.
+
+### `[AVFilterGraph] No such filter: 'scale_cuda'`
+
+Frigate config имеет `hwaccel_args: preset-nvidia` (default). Наш patched
+ffmpeg собран без `--enable-cuda-llvm` (не работает на glibc < 2.38). Эта
+опция компилирует CUDA filters, включая `scale_cuda`.
+
+**Fix**: `hwaccel_args: []` в config.yml. CPU scale (5-10% CPU per FHD25 камера).
+
+**Real fix** (planned): cuframes v0.2 — publisher сам делает resize до detect-size
+и публикует pre-scaled frames. Тогда Frigate не нуждается в scale_cuda.
+
+### `cudaIpcOpenEventHandle: invalid device context`
+
+Consumer container не имеет shared pid namespace с publisher'ом → CUDA driver
+не валидирует IPC peer.
+
+**Fix для cross-container CUDA IPC**: `pid: container:<publisher>` + `ipc:
+container:<publisher>`. Для Frigate этот fix недоступен (см. предыдущий пункт).
+Workaround — поднять Frigate первым после publisher (race window) или использовать
+encoded packet path (v0.2).
+
+### `Nonmatching transport in server reply` от RTSP-output Frigate
+
+Не относится к cuframes — это нормальное поведение Frigate's go2rtc для
+TCP transport. TV/VLC обычно использует UDP — оно работает.
+
+## v0.2: dual-input (detect + record через один RTSP)
+
+После cuframes v0.2 publisher активирует **encoded packet ring** параллельно
+с decoded frames ring. Это даёт Frigate одновременно:
+
+- `cuframes://<key>` — **decoded NV12** для `detect` role (как в v0.1)
+- `cuframes_packets://<key>` — **encoded H.264/H.265** для `record` role
+  (passthrough, без decode)
+
+→ **1 RTSP connection** к камере вместо 2-3 (Frigate сейчас открывает
+отдельный stream для record).
+
+### Setup
+
+```bash
+cuframes-rtsp-source \
+  --rtsp rtsp://admin:pw@192.168.88.98/cam/realmonitor?channel=1 \
+  --key cam-parking \
+  --enable-packet-ring
+```
+
+Publisher держит **два** SHM:
+- `/dev/shm/cuframes-cam-parking`         (decoded NV12, v0.1)
+- `/dev/shm/cuframes-cam-parking-packets` (encoded packets, v0.2)
+
+### Frigate config
+
+```yaml
+cameras:
+  cam_parking:
+    ffmpeg:
+      inputs:
+        - path: cuframes://cam-parking
+          input_args: -f cuframes
+          roles: [detect]
+        - path: cuframes_packets://cam-parking
+          input_args: -f cuframes_packets
+          roles: [record]
+```
+
+### Requirements
+
+- Patched FFmpeg с обоими demuxer'ами:
+  [gx/ffmpeg-patched PR #1](https://git.goldix.org/gx/ffmpeg-patched/pulls/1).
+- Frigate Dockerfile перекомпилирован с этим ffmpeg (см. секцию выше про
+  `cuframes-frigate:0.17` build).
+
+### Trade-offs
+
+| Метрика | v0.1 (frames only) | v0.2 (frames + packets) |
+|---|---|---|
+| RTSP к камере | 1 (publisher) | 1 (publisher) |
+| Frigate-side RTSP | 1+ (record отдельно) | **0** — всё через cuframes |
+| Camera RTSP streams | 2+ | **1** |
+| Доп. VRAM | ring (~10 MB) | без изменений |
+| Доп. host RAM | минимум | + 8 MB на packet ring |
+| Доп. CPU | nominal | nominal (memcpy в shared ring) |
+
+## См. также
+
+- [filter/README.md](../../filter/README.md) — детали FFmpeg demuxer + patch
+- [docs/integration.md](../integration.md) — общий integration guide
+- [docs/protocol.md §10](../protocol.md#10-v02-extension-encoded-packet-ring-proto_version2) — wire-protocol spec для packet ring
+- [BENCHMARKS.md](../../BENCHMARKS.md) — production-measured результаты
+- [ROADMAP.md](../../ROADMAP.md) — v0.3+ planned features
@@ -0,0 +1,47 @@
+# Launch drafts
+
+Drafts для outreach / launch. Все — **draft material**, перед отправкой review.
+
+## Порядок (рекомендуемый)
+
+1. **`frigate-integration-issue.md`** — soft-launch, низкий риск отказа, целевая
+   аудитория уже жалуется на проблему в 3 discussion'ах. Может дать первых
+   early-adopter'ов и social proof для следующего шага.
+2. **`ffmpeg-devel-rfc.md`** — после того как Frigate-discussion получит
+   позитивный engagement (даже один "+1, would use" комментарий — уже traction).
+   Mailing-list FFmpeg-devel предъявляет высокий стандарт; готовиться тщательно.
+3. **`hn-show-post.md`** — финальный, после того как либо RFC получит первый
+   response, либо ясно что молчат. HN — это amplifier, не starting line.
+
+## Что в каждом draft
+
+| Файл | Куда | Формат | Когда |
+|---|---|---|---|
+| [`frigate-integration-issue.md`](frigate-integration-issue.md) | github.com/blakeblackshear/frigate | Discussion (Ideas category) | Сейчас |
+| [`ffmpeg-devel-rfc.md`](ffmpeg-devel-rfc.md) | `ffmpeg-devel@ffmpeg.org` | Patch + cover letter via `git send-email` | После Frigate engagement |
+| [`hn-show-post.md`](hn-show-post.md) | news.ycombinator.com | Show HN | Etap F (finale) |
+
+## Что **не** делать
+
+- Не публиковать всё сразу в один день — невозможно отвечать на all-channels параллельно.
+- Не публиковать в выходные / праздники / во время большого tech-event (Apple keynote, GTC, etc).
+- Не упоминать "AI", "battle-tested", "production-ready", "enterprise" в тексте — все эти аудитории (FFmpeg-devel, Frigate, HN) аллергичны к маркетинговому языку.
+- Не публиковать FFmpeg patch **без** sign-off — automatic rejection.
+- Не отправлять HN-пост если не можешь быть онлайн первые 2 часа после публикации — ранжирование умрёт.
+
+## Что подготовить перед отправкой
+
+- [ ] Subscribe на ffmpeg-devel (https://ffmpeg.org/mailman/listinfo/ffmpeg-devel) — иначе reply'ы не получишь
+- [ ] `git config --global` для send-email (см. ffmpeg-devel-rfc.md шаги)
+- [ ] Sign-off в FFmpeg commit (`git commit --amend -s` если ещё нет)
+- [ ] GitHub аккаунт для Frigate discussion (если нет уже)
+- [ ] HN аккаунт с пара дней истории — fresh accounts автоматически шадо-банятся
+
+## После отправки
+
+Следить за reply'ями в течение первой недели. Все три канала — асинхронные, но первые **48 часов** обычно решающие.
+
+Куда смотреть статус engagement:
+- ffmpeg-devel: https://ffmpeg.org/pipermail/ffmpeg-devel/
+- Frigate discussion: появится в правой панели repo
+- HN: https://news.ycombinator.com/threads?id=YOURUSER
@@ -0,0 +1,160 @@
+# FFmpeg-devel RFC submission
+
+**Status:** DRAFT — review перед отправкой.
+
+**Куда:** `ffmpeg-devel@ffmpeg.org` (subscribe: https://ffmpeg.org/mailman/listinfo/ffmpeg-devel)
+
+**Как:** patch генерится через `git format-patch`, отправляется `git send-email` с cover-letter. FFmpeg **не использует** GitHub PR / pull-request — только mailing-list patches.
+
+---
+
+## Шаги отправки
+
+```bash
+# 1. Конфигурация git send-email (один раз)
+git config --global sendemail.smtpserver smtp.gmail.com
+git config --global sendemail.smtpserverport 587
+git config --global sendemail.smtpencryption tls
+git config --global sendemail.smtpuser ВАШ-EMAIL
+# password — через ~/.netrc или интерактивно
+
+# 2. На fork ffmpeg-patched, в ветке n7.1-cuframes:
+cd /path/to/ffmpeg-patched
+git log --oneline n7.1..n7.1-cuframes  # должна быть одна commit
+
+# 3. Подготовить .patch
+git format-patch -1 --cover-letter --subject-prefix='RFC PATCH' \
+    --output-directory=/tmp/cuframes-rfc \
+    n7.1..n7.1-cuframes
+
+# 4. Отредактировать /tmp/cuframes-rfc/0000-cover-letter.patch:
+#    - Заменить *** SUBJECT HERE *** → см. ниже
+#    - Заменить *** BLURB HERE *** → cover-letter body (см. ниже)
+
+# 5. Dry-run
+git send-email --dry-run --to=ffmpeg-devel@ffmpeg.org /tmp/cuframes-rfc/*.patch
+
+# 6. Реальная отправка
+git send-email --to=ffmpeg-devel@ffmpeg.org /tmp/cuframes-rfc/*.patch
+```
+
+## Subject line
+
+```
+[RFC PATCH 0/1] libavformat/cuframesdec: zero-copy CUDA frame ingest via IPC
+```
+
+## Cover-letter body
+
+```
+Hi all,
+
+This RFC adds a new demuxer "cuframes" to libavformat that ingests already-
+decoded video frames residing in CUDA device memory, produced by another
+process via the libcuframes IPC layer [1].
+
+# Why
+
+In multi-consumer GPU video pipelines (CCTV with multiple analytics
+services, multi-stream transcoding farms, ML inference + recording on the
+same source) every consumer typically runs its own NVDEC session. On 16
+cameras × 25 fps × N consumers this multiplies NVDEC sessions, OS
+context-switches and host<->device PCIe traffic for what is logically the
+same decoded frame.
+
+cuframes addresses this by letting one process decode (e.g. via FFmpeg's
+existing CUDA hwaccel) and publish the decoded frames into a small CUDA
+ring buffer; other processes import the buffer via cudaIpcOpenMemHandle
+and consume the same VRAM allocation without redecoding or copying.
+
+The libavformat demuxer in this RFC is the consumer side: it exposes the
+remote ring buffer as a regular AVFormat input source, so any downstream
+FFmpeg filter chain or muxer can use it transparently.
+
+# Scope of this patch
+
+  libavformat/cuframesdec.c   — new demuxer
+  libavformat/allformats.c    — registration
+  configure                   — --enable-libcuframes option
+
+The demuxer currently outputs NV12 frames via cudaMemcpy2DAsync to host
+memory (rawvideo path). A v0.2 follow-up is planned that emits frames
+directly as CUDA AVHWFramesContext (true zero-copy into a CUDA-aware
+filter chain) — see [2].
+
+# Out-of-tree library
+
+libcuframes (the producer side, the IPC handshake, the ring-buffer
+allocator) lives out-of-tree at [1], licensed LGPL-2.1+ to match FFmpeg.
+The demuxer links against libcuframes via pkg-config.
+
+This mirrors the model used by other libavformat plugins that wrap third-
+party libraries (libsmbclient, librist, libsrt, etc.).
+
+# Testing
+
+- Unit smoke tests in the libcuframes repo (1 publisher × 4 subscribers ×
+  2000 frames @ 120 fps — 0 torn frames, 0 gaps).
+- E2E test against a real RTSP IP camera (Dahua HEVC 1920×1080, 25 fps,
+  100/100 frames, avg_fps=25.03).
+- ~24h production deployment serving Frigate (object detection) and a
+  custom analytics pipeline from a single decoder, single NVDEC session.
+
+# Prior art and what this is not
+
+There is no in-tree mechanism for sharing decoded GPU frames between
+unrelated FFmpeg processes. Existing alternatives are:
+  - CUDA hwdownload + hwupload (defeats the purpose — round-trips via PCIe)
+  - DeepStream Gst-nvstreammux (NVIDIA, closed, GStreamer-only)
+  - Vendor-locked NVENC/NVDEC pooling helpers
+
+cuframes is intentionally minimal: ring buffer + handshake + IPC handles.
+No transcoding logic, no policy.
+
+# Limitations / known issues for review
+
+  - NVIDIA GPUs only (CUDA IPC is vendor-specific).
+  - Linux only (POSIX SHM + AF_UNIX sockets).
+  - Producer and consumer must share the same CUDA device (CUDA IPC limit).
+  - NV12 only in v0.1; other pixel formats are roadmap items.
+  - Driver ≥ 525, CUDA toolkit ≥ 12.0 (≥ 13.0 recommended).
+
+# Feedback wanted
+
+  1. Is the libavformat demuxer the right home for this, or would a
+     hwcontext_cuda extension + a thin demuxer be a better split?
+  2. Are folks open to an out-of-tree library dependency under
+     --enable-libcuframes, given the precedent of librist/libsrt?
+  3. Naming: "cuframes" vs "cudaipcframes" vs something else?
+
+Happy to iterate. Patch follows.
+
+[1] https://git.goldix.org/gx/cuframes  (LGPL-2.1+)
+[2] https://git.goldix.org/gx/cuframes/issues/2  (v0.2 zero-copy plan)
+
+Signed-off-by: <YOUR NAME> <YOUR EMAIL>
+```
+
+## Notes на review
+
+- **Subject prefix `[RFC PATCH]`** — потому что это design discussion, не "merge this now". Если получите конструктивный feedback и сделаете revision — следующая будет `[PATCH v2]`.
+- **Sign-off обязателен** — иначе patch отклонят на уровне tooling.
+- **Не упоминать** "production-ready", "battle-tested", "30 days of uptime" — FFmpeg-devel список **очень** аллергичен на маркетинговый тон. Numbers OK, эпитеты нет.
+- **Не CC** maintainers без приглашения — ответят те, кому интересно. Можно CC Timo Rothenpieler (CUDA hwaccel maintainer) если хочется ускорить — но **только** после первого revision если тишина.
+- Возможные возражения:
+  - "Why not Vulkan video?" — Vulkan video не имеет cross-process sharing API на уровне CUDA IPC. Vulkan external memory работает с DMA-BUF на Linux но требует DRM device sharing, что тоже non-trivial — отдельный RFC материал.
+  - "Why a new demuxer, not a filter?" — потому что producer уже **вне** этого FFmpeg-процесса; demuxer — это место где AVFormat читает из внешнего источника. Filter pull'ает из upstream AVStream — здесь нет upstream.
+
+## Альтернативный путь — ffmpeg-user (lighter)
+
+Если кажется что для `-devel` сразу с patch'ем тяжело — можно начать с **awareness email** в `ffmpeg-user@ffmpeg.org`:
+
+```
+Subject: ANNOUNCE: libcuframes — zero-copy CUDA frame sharing for FFmpeg pipelines
+
+[3 параграфа: what / why / link to repo]
+
+Patch для libavformat будет отправлен в -devel список после feedback от пользователей.
+```
+
+Это **soft launch** — мень рисков отказа, больше шансов получить early adopters которые потом support'ят RFC. Рекомендую этот шаг **сначала**.
@@ -0,0 +1,115 @@
+# Frigate integration issue
+
+**Status:** DRAFT — review перед публикацией.
+
+**Куда:** https://github.com/blakeblackshear/frigate
+
+**Тип:** GitHub **Discussion** (category: Ideas), **не** Issue. Причина: это feature proposal, не баг. Frigate активно использует discussions (см. [#17033](https://github.com/blakeblackshear/frigate/discussions/17033), [#20191](https://github.com/blakeblackshear/frigate/discussions/20191), [#21559](https://github.com/blakeblackshear/frigate/discussions/21559) — все три уже жалуются на эту проблему).
+
+**Альтернатива:** ответить в одной из существующих discussion'ов о NVDEC saturation. Может быть лучше — там уже собралась audience.
+
+---
+
+## Title
+
+```
+[Ideas] Reduce NVDEC duplication on multi-consumer cameras via shared CUDA frame buffer (cuframes)
+```
+
+## Body
+
+```markdown
+## Problem
+
+When Frigate co-exists with other GPU-using video consumers on the same
+camera stream (separate AI processor, custom analytics, recording to a
+second NVR, etc.), each process opens its own NVDEC session and decodes
+the same H.264/HEVC stream independently. On 16+ cameras at 25 fps this
+becomes the bottleneck on consumer GPUs:
+
+- NVDEC sessions are limited (4 concurrent on RTX 30xx/40xx, more on
+  workstation cards). Decoder context creation / destruction is not free.
+- Each duplicate decode burns PCIe bandwidth pushing the same NV12 frame
+  to host memory (in setups that go through `hwdownload`).
+- Power draw and thermals scale with redundant decoding.
+
+Related discussions: #17033, #20191, #21559.
+
+## Existing workarounds
+
+- Single Frigate restream and have everything else pull from go2rtc — works
+  for re-encoding to TCP/UDP, but every downstream still re-decodes.
+- DeepStream `nvstreammux` — solves it but is closed-source NVIDIA stack,
+  GStreamer-only, not co-installable with current Frigate ffmpeg pipeline.
+
+## Proposal: cuframes ingest source
+
+[cuframes](https://git.goldix.org/gx/cuframes) (LGPL-2.1+) is a small
+library that lets one process decode once into a CUDA ring buffer and any
+number of other processes import that buffer via CUDA IPC and consume
+**zero-copy** in VRAM.
+
+Concretely for Frigate this would mean a new ffmpeg input source like:
+
+```yaml
+cameras:
+  driveway:
+    ffmpeg:
+      inputs:
+        - path: cuframes://driveway
+          input_args: preset-cuframes
+          roles: [detect]
+```
+
+where a sentinel container (one per camera, ~5MB RAM, runs
+`cuframes-rtsp-source`) does the actual RTSP pull + NVDEC and Frigate
+attaches to that pre-decoded stream.
+
+## Working integration (early proof)
+
+I've been running this in production for ~24h: a single
+`cuframes-rtsp-source` container per camera serves both Frigate
+(detection role) **and** a separate C++ analytics pipeline from the same
+NVDEC session. Frigate gets pre-decoded NV12 frames; no detection or
+recording behaviour was changed.
+
+Integration guide with full docker-compose and a patched Frigate Dockerfile:
+https://git.goldix.org/gx/cuframes/src/branch/main/docs/integrations/frigate.md
+
+## What I'm asking for
+
+Not a PR yet — first I'd like maintainer / community input on:
+
+1. Would Frigate be open to **upstream** a `cuframes://` input source, or
+   should this stay a third-party patched Frigate image?
+2. If upstream — what's the preferred shape: new ffmpeg preset only
+   (zero core code changes), or a first-class `decoder: cuframes` option
+   in the Frigate config schema?
+3. The cuframes library currently requires `--ipc` and `--pid` namespace
+   sharing between producer and consumer containers. Frigate uses
+   `s6-overlay` which is incompatible with `--pid` share (s6 needs PID 1).
+   The current integration uses a small race-window workaround
+   ([troubleshooting #2](https://git.goldix.org/gx/cuframes/src/branch/main/docs/troubleshooting.md));
+   a cleaner solution requires either making s6 optional in the Frigate
+   image or moving the IPC handshake to a sidecar pattern.
+
+## Limitations of cuframes (full disclosure)
+
+- NVIDIA GPUs only.
+- Linux only.
+- Producer + consumer must share the same CUDA device.
+- NV12 frame format only in v0.1.
+- Requires patching FFmpeg with a small (~400 LOC) demuxer; an upstream
+  FFmpeg RFC is in flight separately.
+
+If this looks worth pursuing I'm happy to open a draft PR against a feature
+branch and iterate.
+```
+
+## Notes на review
+
+- **Tone:** Frigate maintainer (Blake) ценит конкретику и production proof — без них любой feature request кладётся в backlog. У нас есть production proof (24h+) — это сильный аргумент, использован прямо.
+- **Не обещаем upstream без request'а** — спрашиваем discussion'ом, не PR'ом. Если Blake скажет "не наш scope, оставайтесь third-party" — это OK; integration guide уже валиден как standalone.
+- **Прозрачно про s6-overlay constraint** — это блокирующий issue для clean upstream'а. Лучше упомянуть сразу чем спрятать и получить отказ через 2 недели review.
+- **Линки на 3 existing discussions** — показывает что problem подтверждена сообществом, не наша одинокая боль.
+- **Не упоминать другие AI-системы** (ANPR, face recognition итд) — Blake уже несколько раз говорил что Frigate scope = детектор и NVR, не platform. Подача "cuframes решает вашу проблему" работает лучше чем "cuframes построит экосистему".
@@ -0,0 +1,107 @@
+# Show HN post (для Etap F — позже)
+
+**Status:** DRAFT — не публикуем сейчас. Этот файл черновик к Etap F (launch).
+
+**Куда:** https://news.ycombinator.com/submit
+
+**Когда публиковать:**
+- После того как FFmpeg-devel RFC получит первый response (даже отказ — это traction)
+- ИЛИ после того как Frigate discussion получит +5 upvotes / 3+ комментариев
+- ИЛИ если оба молчат 2 недели — публиковать в любом случае, HN-аудитория более независимая
+- **Время:** будний день, 13:00-15:00 UTC (peak HN traffic from US morning + EU afternoon)
+- **Не публиковать** в пятницу вечером / в выходные / в крупный tech-event день (Apple keynote, GTC, etc.) — drown'ит в шуме
+
+---
+
+## Title
+
+Опции (выбрать одну):
+
+1. `Show HN: Cuframes – zero-copy sharing of decoded video frames between processes via CUDA IPC`
+2. `Show HN: Stop redecoding the same RTSP stream in every consumer`
+3. `Show HN: Cuframes – one NVDEC, many consumers, zero-copy in VRAM`
+
+Рекомендую **#2** — describes problem in 7 words, HN любит problem-first titles. #1 — для технической HN ниши тоже OK.
+
+## Body
+
+```markdown
+Hi HN,
+
+I run a homelab CCTV stack with 16 cameras feeding into Frigate (object
+detection), a custom C++ analytics service, and a recording NVR. All three
+were running NVDEC on the same RTSP streams. On an RTX 3060 this saturated
+the decoder slots and the consumer GPUs in my office burnt about 40W of
+redundant decoding when nothing interesting was happening.
+
+So I wrote a small library that lets one process decode the stream once
+into a CUDA ring buffer and the others import the same buffer via
+cudaIpcOpenMemHandle. Decoded NV12 frame lands in VRAM exactly once, every
+consumer reads it zero-copy.
+
+Repo (LGPL-2.1+): https://git.goldix.org/gx/cuframes
+
+What's in it:
+
+  - libcuframes — the producer/consumer C/C++ library
+  - cuframes-rtsp-source — standalone RTSP → cuframes bridge (one per cam)
+  - A small out-of-tree FFmpeg demuxer ("cuframes://") so downstream
+    consumers don't need to know they're consuming shared frames
+  - Reference docker-compose for the Frigate + custom-app setup
+  - 24h production deployment on the homelab, ~25 fps × 16 cameras × 3
+    consumers from a single NVDEC session
+
+What surprised me along the way:
+
+  - CUDA IPC handles are bound to the device that allocated them, not just
+    a CUDA context — both peers must be on the same GPU. (Documented;
+    bit out of the way in the Programming Guide §3.2.8.)
+  - Cross-container CUDA IPC needs both --ipc and --pid namespace share,
+    not just --ipc. The latter wasn't obvious from the error message
+    ("invalid device context" with no mention of /proc visibility).
+  - Frigate's s6-overlay is incompatible with --pid share because s6
+    insists on being PID 1. There's a documented race-window workaround
+    but it's the one rough edge.
+
+What it is not:
+
+  - Not a transcoding framework. No re-encoding, no filtering, no policy.
+  - Not multi-GPU (CUDA IPC is single-device).
+  - Not Windows / macOS / WSL2 / AMD.
+
+What's next:
+
+  - Upstream FFmpeg RFC for the demuxer (drafted, not sent yet — would
+    appreciate review of the RFC text first).
+  - v0.2 makes the FFmpeg path true zero-copy via AVHWFramesContext (no
+    cudaMemcpy2DAsync round-trip).
+
+Happy to answer questions. Especially interested in:
+
+  - Anyone running multi-consumer GPU video pipelines with a different
+    solution? Curious what tradeoffs you hit.
+  - Vulkan-video folks: is there an obvious cross-process sharing path
+    via VkExternalMemory + DMA-BUF that I'm missing? I went CUDA-only
+    because that's what worked first, but Vulkan would be vendor-neutral.
+
+— [your handle]
+```
+
+## Notes на review
+
+- **HN формат:** первая строка — hook (concrete problem, concrete numbers — "40W redundant decoding"). НЕ начинать с "Hi everyone, today I'm excited to share..."
+- **Без emoji**, без markdown headers (HN не renders'ит markdown в title-area; body тоже почти plain text)
+- **Конкретные числа** — HN respect'ит numbers. "40W", "24h", "25 fps × 16 cam × 3 consumer", "~400 LOC patch"
+- **"What it is not"** — отсекает Vue Apologists которые иначе пишут "why don't you support Windows?". Это HN best practice
+- **Open questions внизу** — driver discussion. Без них первый комментарий = "и зачем это?". С ними — "вот мой опыт с DeepStream"
+- **Avoid:** "battle-tested", "production-ready", "enterprise-grade", "10x faster than X" — HN crowd специально downvotes такое
+- **Будь готов** отвечать **первые 2 часа** активно — HN ранжирование сильно зависит от engagement в первый час. Если не сможешь быть в офлайне — не публикуй
+- **Если автор — не main maintainer** repo — упомянуть это в первом комменте от собственного аккаунта чтобы не выглядело как третье-лицо PR
+
+## Альтернатива — r/selfhosted
+
+Если HN кажется слишком high-stakes, можно сначала **r/selfhosted** (180k subs) — там Frigate-аудитория, прямой fit. Менее brutal, легче получить early feedback.
+
+Title для reddit: `Reduced NVDEC saturation across Frigate + custom apps by sharing decoded frames over CUDA IPC — open-sourced the library`
+
+Этот текст короче (HN body слишком длинный для reddit), но идея та же.
@@ -423,3 +423,400 @@ TEST(Handshake, HelloRespMismatchProto) {
 `libcuframes/src/protocol.c` (Phase 1, Step 2) — единственная reference.
 Любая другая реализация (Python ctypes, Rust bindings, FFmpeg plugin)
 должна **conformance-tested** против этого документа.
+
+## 10. v0.2 extension: encoded packet ring (proto_version=2)
+
+**Статус:** design draft, ещё не реализовано (см. issue #2).
+
+Параллельно с decoded-frames ring (§2) publisher может опционально
+поддерживать **encoded packet ring** — публикует raw H.264/H.265 NAL units
+**до** decoder, для consumer'ов которые делают `-c:v copy` (recording, mux).
+
+### 10.1 Совместимость с v1
+
+- v2 publisher принимает **v1-subscribers** — они получают только frames
+  ring (как v0.1), packet ring им не показывается.
+- v1 publisher отвергает v2-subscribers с `wants_packets=true`
+  (HELLO_RESP error PROTOCOL).
+- v1 layout (§2) **не меняется** для frames ring — packet ring это отдельный SHM.
+
+Publisher version bumping:
+- `proto_version` = 2 в SHM header и в HELLO_RESP когда packet ring active.
+- Если publisher v2 не активирует packet ring (`enable_packet_ring=false`)
+  — `proto_version` остаётся 1 (полная v1 compat).
+
+### 10.2 Дополнительные ресурсы
+
+| Resource | Path | Назначение | Когда |
+|---|---|---|---|
+| Packet shared memory | `/dev/shm/cuframes-<key>-packets` | Packet ring header + slots + byte buffer | если publisher активировал packet ring |
+
+Cleanup — симметрично §1: `shm_unlink` при destroy(); orphaned автоматически
+если nobody mmap'ит.
+
+### 10.3 Packet ring layout
+
+Размер пакетного SHM: `sizeof(packet_ring_header_t) + N×PSE + DATA_SIZE`,
+где:
+- N = `packet_ring_slots`, default 64 (configurable)
+- PSE = `sizeof(packet_slot_entry_t)` = 64 байт (см. §10.5)
+- DATA_SIZE = `packet_data_size`, default 8 MB (configurable)
+
+#### Byte layout
+
+```
+Offset                  Size   Field                       Comments
+─────────────────────── ────── ──────────────────────────  ─────────────────────────────
+0x0000                       4 magic (LE u32)              0xCC7C1DCD  (frames magic + 1)
+0x0004                       4 proto_version (LE u32)      2
+0x0008                       4 ring_slots (LE u32)         N (1..1024)
+0x000C                       4 data_size (LE u32)          bytes for packet data ring
+0x0010                       4 codec_id (LE u32)           AV_CODEC_ID_* enum (см. §10.4)
+0x0014                       4 codec_extradata_size (LE u32)  ≤ 4096
+0x0018                       8 producer_pid (LE u64)
+0x0020                       8 global_seq (LE u64, atomic) монотонная по packets
+0x0028                       8 last_keyframe_seq (LE u64, atomic)  для late subscribers
+0x0030                       8 write_offset (LE u64, atomic) текущий cursor в data ring
+0x0038                       8 shutdown_flag (LE u64, atomic)
+0x0040                    4096 codec_extradata             SPS/PPS/VPS bytes (см. §10.4)
+0x1040                    N×64 slots[N]                    packet_slot_entry_t (см. §10.5)
+0x1040+N×64           DATA_SIZE data[]                     wraparound byte buffer
+```
+
+Все atomic fields — C11 `_Atomic` (release/acquire semantics для seq updates).
+
+### 10.4 Codec extradata
+
+H.264 — SPS + PPS, конкатенированные в **Annex B** формате
+(start codes `00 00 00 01`). H.265 — VPS + SPS + PPS.
+
+`codec_id` соответствует FFmpeg `AV_CODEC_ID_H264`, `AV_CODEC_ID_HEVC`,
+`AV_CODEC_ID_AV1` (future). Subscriber пишет этот extradata в
+`AVCodecContext.extradata` своего decoder'а (если он его создаёт)
+или в `AVStream.codecpar->extradata` для muxer'ов.
+
+Extradata устанавливается publisher'ом **один раз** при первом keyframe
+(или из RTSP SDP до первого packet). После — fixed на lifetime publisher'а
+(codec change mid-stream → publisher destroy+recreate с новым `<key>`).
+
+### 10.5 Packet slot entry (64 байта)
+
+```
+Offset  Size   Field                       Comments
+0x00      8    seq (LE u64, atomic)        published seq; UINT64_MAX = invalid
+0x08      8    pts_ns (LE i64)
+0x10      8    dts_ns (LE i64)             для B-frames pipelines
+0x18      8    data_offset (LE u64)        offset в `data[]` секции SHM
+0x20      4    data_size (LE u32)          size of payload bytes
+0x24      4    flags (LE u32)              §10.6
+0x28     24    reserved                    0
+```
+
+`data_offset` может быть **больше** `data_size` секции SHM — semantics
+"absolute byte cursor", фактический byte index = `data_offset % data_size`.
+Subscriber может detect wrap (если payload crosses end → split read).
+
+### 10.6 Packet flags
+
+```
+Bit  Name              Comments
+0    KEY               keyframe (IDR for H.264, или CRA/IDR для HEVC).
+                       Critical для late subscribers — must wait IDR.
+1    CORRUPT           publisher detect'нул что packet damaged
+                       (RTP loss и т.п.) — subscriber может skip
+2    DISCONTINUITY     был gap перед этим packet
+                       (publisher reconnect к камере)
+3    LAST_IN_AU        last NAL в access unit (полный frame)
+                       — для muxer'ов которые ждут полный frame
+4-31 reserved          0
+```
+
+Mapping в `AVPacket.flags`:
+- bit 0 (KEY) → `AV_PKT_FLAG_KEY`
+- bit 1 (CORRUPT) → `AV_PKT_FLAG_CORRUPT`
+- bit 2 (DISCONTINUITY) → `AV_PKT_FLAG_DISCONTINUITY` (FFmpeg 5+)
+
+### 10.7 Atomic publish (publisher-side)
+
+```c
+// Pseudo-C (упрощено, без error handling)
+uint64_t seq = atomic_load(&hdr->global_seq, RELAXED) + 1;
+uint64_t off = atomic_load(&hdr->write_offset, RELAXED);
+
+// 1. Найти free slot (overwrite oldest)
+size_t slot_idx = seq % hdr->ring_slots;
+packet_slot_entry_t *slot = &slots[slot_idx];
+
+// 2. Записать payload bytes (wraparound, может потребовать 2 memcpy)
+size_t off_in_ring = off % hdr->data_size;
+size_t first_chunk = min(size, hdr->data_size - off_in_ring);
+memcpy(data + off_in_ring, payload, first_chunk);
+if (first_chunk < size)
+    memcpy(data, payload + first_chunk, size - first_chunk);
+
+// 3. RELEASE: записать metadata в slot
+slot->pts_ns = pts;
+slot->dts_ns = dts;
+slot->data_offset = off;
+slot->data_size = size;
+slot->flags = flags;
+atomic_store(&slot->seq, seq, RELEASE);
+
+// 4. Update global cursor + global_seq
+atomic_store(&hdr->write_offset, off + size, RELEASE);
+atomic_store(&hdr->global_seq, seq, RELEASE);
+
+// 5. If KEY → update last_keyframe_seq
+if (flags & PKT_FLAG_KEY)
+    atomic_store(&hdr->last_keyframe_seq, seq, RELEASE);
+```
+
+### 10.8 Atomic read (subscriber-side)
+
+```c
+// Pseudo-C
+uint64_t cur = atomic_load(&hdr->global_seq, ACQUIRE);
+if (cur <= my_last_seq)  return TIMEOUT;  // ничего нового
+
+uint64_t want_seq = my_last_seq + 1;
+size_t slot_idx = want_seq % hdr->ring_slots;
+packet_slot_entry_t *slot = &slots[slot_idx];
+
+uint64_t slot_seq = atomic_load(&slot->seq, ACQUIRE);
+if (slot_seq != want_seq) {
+    // overrun — slow subscriber. Re-anchor:
+    want_seq = atomic_load(&hdr->last_keyframe_seq, ACQUIRE);
+    slot_idx = want_seq % hdr->ring_slots;
+    slot = &slots[slot_idx];
+    return DROPPED;  // signal user через flags = DISCONTINUITY
+}
+
+// Copy payload (wraparound aware)
+uint64_t off = slot->data_offset % hdr->data_size;
+uint32_t size = slot->data_size;
+uint32_t first_chunk = min(size, hdr->data_size - off);
+memcpy(out_buf, data + off, first_chunk);
+if (first_chunk < size)
+    memcpy(out_buf + first_chunk, data, size - first_chunk);
+
+// Re-check slot->seq не изменился (защита от overrun mid-read)
+if (atomic_load(&slot->seq, ACQUIRE) != want_seq) {
+    return DROPPED;  // publisher overwrote во время copy
+}
+
+my_last_seq = want_seq;
+return OK;
+```
+
+Защита от overrun mid-read через **post-check `slot->seq`** — простая
+вариант seqlock. Если publisher успел overwrite между metadata-read и
+data-copy — subscriber detect и retry.
+
+### 10.9 Socket protocol extensions
+
+#### HELLO_REQ — добавляются flags в reserved field
+
+v1 layout (§3.3):
+```
+[4 bytes]  proto_version
+[4 bytes]  consumer_name_len
+[N bytes]  consumer_name
+[4 bytes]  cuda_device
+[4 bytes]  mode
+[12 bytes] reserved (must be 0)  ← v0.2 использует первые 4 байта
+```
+
+v0.2 интерпретирует первые 4 байта `reserved` как `subscribe_flags`:
+
+| Bit | Name | Comments |
+|---|---|---|
+| 0 | `WANTS_FRAMES` | подписаться на decoded frames ring (default ON в v1 — implicit) |
+| 1 | `WANTS_PACKETS` | подписаться на encoded packet ring |
+| 2-31 | reserved | 0 |
+
+Если v1-subscriber оставляет reserved=0 — publisher v2 интерпретирует это
+как `WANTS_FRAMES=true, WANTS_PACKETS=false` (v1 backward-compat).
+
+#### HELLO_RESP — добавляются packet-ring fields
+
+v1 layout (§3.4) расширяется в reserved секции:
+
+```
+[4 bytes]  result
+[4 bytes]  proto_version_actual         ← теперь может быть 1 или 2
+[4 bytes]  ring_size                    ← frames ring
+[4 bytes]  ownership_mode
+[64 bytes] frame_meta
+[4 bytes]  shm_path_len                 ← frames SHM
+[N bytes]  shm_path
+[12 bytes] reserved                     ← v0.2 интерпретирует
+```
+
+v0.2 reserved layout (если `proto_version_actual == 2` И publisher
+поддерживает packets):
+```
+[4 bytes]  packet_shm_path_len (LE u32) 0 = packets disabled at publisher
+[N bytes]  packet_shm_path (UTF-8)      — относительно /dev/shm/, например "cuframes-camA-packets"
+[4 bytes]  codec_id (LE u32)            AV_CODEC_ID_*
+[4 bytes]  initial_packet_seq (LE u64)  last_keyframe_seq на момент handshake
+                                        (subscriber должен start с этого seq)
+```
+
+Если subscriber запросил `WANTS_PACKETS=1` но publisher не имеет packet ring
+— `result = ERR_NOT_AVAILABLE`.
+
+### 10.10 Subscriber state machine extension
+
+Подключение к **обоим** rings (или одному из):
+
+```
+   ┌──────────┐
+   │ HELLO_OK │ proto_version_actual=2, packet_shm_path_len>0
+   └────┬─────┘
+        │
+        ▼
+   ┌────────────────────────────────┐
+   │ Open frames SHM (если WANTS_FRAMES) │ → standard v1 flow
+   └────────────────────────────────┘
+        │
+        ▼
+   ┌────────────────────────────────┐
+   │ Open packet SHM (если WANTS_PACKETS) │
+   │ - mmap /dev/shm/cuframes-<key>-packets │
+   │ - check magic, proto_version            │
+   │ - set my_last_packet_seq = initial_packet_seq - 1 │
+   │   (так что первый next_packet вернёт IDR)         │
+   └────────────────────────────────┘
+        │
+        ▼
+   ┌─────────┐
+   │ READY   │ — frames или packets или оба доступны
+   └─────────┘
+```
+
+### 10.11 Threading в subscriber
+
+Frames ring и packet ring имеют **разные** `global_seq` counters.
+Subscriber имеет **отдельные** `my_last_seq` для каждого. Может
+poll'ить обе независимо (или через два threads).
+
+Producer's `cudaEventRecord` (frames sync) не релевантен для packets —
+encoded data на CPU, без CUDA sync.
+
+### 10.12 Конфигурируемость packet ring
+
+Publisher API extension (§10.13) принимает параметры:
+
+```c
+typedef struct {
+    uint32_t packet_ring_slots;   // default 64
+    uint32_t packet_data_size;    // default 8 MB (8388608)
+    uint32_t max_packet_size;     // default 2 MB — sanity guard для оversized
+                                  // packets (publisher rejects with error)
+    uint32_t codec_id;            // AV_CODEC_ID_H264 / HEVC / ...
+} cuframes_packet_ring_options_t;
+```
+
+### 10.13 API extension (для cuframes.h)
+
+```c
+/* Сreate publisher с активным packet ring. NULL для opts → packet ring disabled. */
+int cuframes_publisher_create_ex(
+    const cuframes_publisher_options_t *frames_opts,
+    const cuframes_packet_ring_options_t *packet_opts, /* NULL = no packet ring */
+    cuframes_publisher_t **pub_out
+);
+
+/* Set codec extradata (SPS/PPS) — должен быть called до первого publish_packet. */
+int cuframes_publisher_set_codec_extradata(
+    cuframes_publisher_t *pub,
+    const void *extradata,
+    size_t size
+);
+
+/* Публикация packet. Slow consumer = overwrite oldest. */
+int cuframes_publisher_publish_packet(
+    cuframes_publisher_t *pub,
+    const void *data,
+    size_t size,
+    int64_t pts_ns,
+    int64_t dts_ns,
+    uint32_t flags   /* CUFRAMES_PKT_FLAG_KEY | _CORRUPT | _DISCONTINUITY | _LAST_IN_AU */
+);
+
+/* Subscriber-side: подписаться с opt-in для packets. */
+typedef struct {
+    /* ... existing v1 fields ... */
+    uint32_t subscribe_flags;  /* WANTS_FRAMES, WANTS_PACKETS bits */
+} cuframes_subscriber_options_v2_t;
+
+int cuframes_subscriber_create_v2(
+    const cuframes_subscriber_options_v2_t *opts,
+    cuframes_subscriber_t **sub_out
+);
+
+/* Чтение packet. Opaque handle — каллер вызывает release_packet после. */
+typedef struct cuframes_packet cuframes_packet_t;
+
+int cuframes_subscriber_next_packet(
+    cuframes_subscriber_t *sub,
+    cuframes_packet_t **pkt_out,
+    int32_t timeout_ms
+);
+
+const void * cuframes_packet_data(const cuframes_packet_t *p);
+size_t       cuframes_packet_size(const cuframes_packet_t *p);
+int64_t      cuframes_packet_pts(const cuframes_packet_t *p);
+int64_t      cuframes_packet_dts(const cuframes_packet_t *p);
+uint32_t     cuframes_packet_flags(const cuframes_packet_t *p);
+
+int cuframes_subscriber_release_packet(cuframes_subscriber_t *sub, cuframes_packet_t *p);
+
+/* Codec params для subscriber (extracted из shared header). */
+int cuframes_subscriber_get_codec_params(
+    cuframes_subscriber_t *sub,
+    uint32_t *codec_id_out,
+    const void **extradata_out,
+    size_t *extradata_size_out
+);
+```
+
+`cuframes_packet_t` opaque — фактически указатель в local-mapped data (на
+heap subscriber'а — copy при `next_packet`, освобождение при `release`).
+Subscriber **не** держит ссылки на shared ring data между `next_packet` и
+`release_packet` — это избавляет от reader-locks.
+
+### 10.14 Late subscriber → keyframe-aligned start
+
+При SUBSCRIBE_RESP publisher отвечает `initial_packet_seq = last_keyframe_seq`.
+
+Subscriber устанавливает `my_last_seq = initial_packet_seq - 1`, так что
+первый `next_packet` вернёт keyframe (decoder может start без glitches).
+
+**Risk:** если в момент handshake **last_keyframe_seq уже выехал из
+ring** (slow start subscriber, GOP > ring_slots packets) — subscriber
+detect overrun в первом read и переходит на следующий keyframe.
+
+В implementation `publisher_publish_packet` для оптимизации может маркировать
+slot перед IDR как **persistent** (флаг в reserved), но **v0.2 keep simple** —
+просто требуем что `packet_ring_slots × avg_packet_size > GOP_size_in_bytes`
+для нормальной работы. Sizing guide см. в [docs/integration.md](integration.md).
+
+### 10.15 Error codes (новые)
+
+| Code | Name | Когда |
+|---|---|---|
+| -20 | `CUFRAMES_ERR_PACKET_OVERSIZED` | publish_packet с size > max_packet_size |
+| -21 | `CUFRAMES_ERR_NO_PACKET_RING` | subscriber запросил packets, publisher без packet ring |
+| -22 | `CUFRAMES_ERR_NO_CODEC_PARAMS` | get_codec_params вызван до set_codec_extradata publisher'ом |
+| -23 | `CUFRAMES_ERR_PACKET_OVERRUN` | subscriber slow — packet seq уехал, надо resync на keyframe |
+
+### 10.16 Open для v0.3+
+
+- **Sub-stream selection** — publisher может публиковать несколько
+  packet rings (для multi-resolution streams). Сейчас один key = один stream.
+  v0.3 → `<key>-substream-<N>` naming?
+- **Codec change mid-stream** — текущий design требует publisher restart.
+  Future: invalidate codec_extradata + bump generation field.
+- **Audio streams** — analogichno в packet ring, но codec_id = audio (AAC,
+  Opus). v0.3.
@@ -181,4 +181,36 @@ Phase 0 PoC (2026-05-14):
 - **Docker:** 29.1.3 с nvidia-container-runtime
 - **Container:** Ubuntu 24.04 + GCC 13 + Clang + CMake 3.28 + Ninja

-Дополнительный target matrix будет в CI после Phase 4.
+## Production deployment matrix (v0.1.0)
+
+Что подтверждено в 24h+ production run:
+
+| Слой | Версия | Comments |
+|---|---|---|
+| NVIDIA driver | 555+ | минимум для CUDA 12 user runtime |
+| CUDA toolkit (build) | 12.4 (Debian 12 / Ubuntu 22.04) либо 13.0 (Ubuntu 24.04) | toolkit для builder image, не runtime |
+| GPU | RTX 5090 (sm_120) | проверено; раньше — sm_75 минимум |
+| Builder OS | Ubuntu 22.04 (glibc 2.35) | forward-compat с Debian 12 runtime |
+| Runtime OS (Frigate) | Debian 12 (glibc 2.36) | base image Frigate `stable-tensorrt` |
+| Runtime OS (cctv-backend) | Ubuntu 22.04 либо Debian 12 | matched с builder |
+| Docker | 29.1.x | для buildx |
+| docker buildx | v0.34.0+ | `apt install docker-buildx-plugin` либо manual install из GH releases |
+| nvidia-container-toolkit | 1.14+ | для `runtime: nvidia` |
+
+## Docker namespace requirements (cross-container CUDA IPC)
+
+Для consumer'а который подключается к publisher'у в **другом** container'е:
+
+| Что нужно | Как настроить |
+|---|---|
+| `/dev/shm` shared (header + ring metadata) | `ipc: container:<publisher>` либо `ipc: shareable` у publisher + same у consumer |
+| `/proc` visibility (CUDA IPC peer validation) | `pid: container:<publisher>` |
+| `/run/cuframes/*.sock` доступен | volume mount `cuframes_sock:/run/cuframes:ro` |
+| GPU access | `runtime: nvidia` + `NVIDIA_VISIBLE_DEVICES=all` |
+| Socket file permissions | `user: root` либо chmod в publisher |
+
+**Все 5** должны быть выполнены. Подробности — [docs/troubleshooting.md](troubleshooting.md).
+
+**Special case: s6-overlay containers (Frigate, linuxserver.io stack)**: `pid:` share **невозможен** — s6-overlay требует PID 1. Workaround: только `ipc:` + race window connect. См. troubleshooting.
+
+Дополнительный target matrix будет в CI после Phase 4 (см. [ROADMAP.md](../ROADMAP.md)).
@@ -0,0 +1,326 @@
+# Troubleshooting
+
+Реальные грабли которые мы прошли при первой production deployment'е cuframes
+(Frigate + custom C++ processor + custom Python). Документировано чтобы вы их
+не повторяли.
+
+## Содержание
+
+- [Runtime / CUDA IPC](#runtime--cuda-ipc)
+  - [`cudaIpcOpenEventHandle: invalid device context`](#cudaipcopeneventhandle-invalid-device-context)
+  - [Subscriber timeout (`cuframes_subscriber_create: timeout`)](#subscriber-timeout)
+  - [Permission denied на socket](#permission-denied-на-socket)
+- [Frigate-specific](#frigate-specific)
+  - [`s6-overlay-suexec: fatal: can only run as pid 1`](#s6-overlay-suexec-fatal-can-only-run-as-pid-1)
+  - [`No such filter: 'scale_cuda'`](#no-such-filter-scale_cuda)
+  - [Missing dynamic .so после ffmpeg replace](#missing-dynamic-so-после-ffmpeg-replace)
+- [Build / FFmpeg patch](#build--ffmpeg-patch)
+  - [`libcuframes not found` при configure](#libcuframes-not-found-при-configure)
+  - [`ffbuild/library.mak: No such file`](#ffbuildlibrarymak-no-such-file)
+  - [`could not find a working compiler` (GMP)](#could-not-find-a-working-compiler-gmp)
+  - [`zlib: download failed` в crosstool-NG](#zlib-download-failed-в-crosstool-ng)
+  - [`stdbit.h: No such file` при `--enable-cuda-llvm`](#stdbith-no-such-file-при---enable-cuda-llvm)
+- [Docker / IPC](#docker--ipc)
+  - [Cross-container CUDA IPC: ipc + pid namespace share](#cross-container-cuda-ipc-ipc--pid-namespace-share)
+  - [Buildx container driver не видит host images](#buildx-container-driver-не-видит-host-images)
+- [Networking / RTSP](#networking--rtsp)
+  - [RTSP/RTP UDP не доходит до клиента (docker NAT)](#rtsprtp-udp-не-доходит-до-клиента-docker-nat)
+  - [`Nonmatching transport in server reply`](#nonmatching-transport-in-server-reply)
+- [Gitea Actions / CI](#gitea-actions--ci)
+  - [`node: executable file not found`](#node-executable-file-not-found)
+  - [`SyntaxError: Unexpected token '{'` (Node 12)](#syntaxerror-unexpected-token--node-12)
+
+---
+
+## Runtime / CUDA IPC
+
+### `cudaIpcOpenEventHandle: invalid device context`
+
+**Симптом**: subscriber сразу после `cuframes_subscriber_create` падает с этой ошибкой.
+
+**Причина**: CUDA driver проверяет IPC peer через `/proc/<pid>/...`. Если процесс publisher'а **не виден** в PID namespace consumer'а — context считается невалидным.
+
+**Fix**: shared PID namespace.
+
+Docker:
+```yaml
+consumer:
+  ipc: "container:<publisher>"   # shared /dev/shm
+  pid: "container:<publisher>"   # ← вот это критично, без него fail
+```
+
+Host process: запуск consumer'а на host'е (либо publisher'а на host'е тоже) — same default namespace.
+
+**Caveat**: если consumer image использует s6-overlay (Frigate, linuxserver.io
+images) — `pid: container:` несовместим (см. [соответствующую секцию](#s6-overlay-suexec-fatal-can-only-run-as-pid-1)).
+
+### Subscriber timeout
+
+**Симптом**: `cuframes_subscriber_create: timeout` без других ошибок.
+
+**Причины** (в порядке вероятности):
+1. `/run/cuframes/<key>.sock` не виден consumer'у — забыли volume-mount
+2. `/run/cuframes` смонтирован, но publisher ещё не успел создать socket — увеличить `connect_timeout_ms`
+3. Publisher запущен, socket есть, но **permission denied** — см. ниже
+
+### Permission denied на socket
+
+**Симптом**: socket виден через `ls -la /run/cuframes/`, owner `root`. Consumer process — non-root user → не может `connect()`.
+
+**Fix**:
+- Запустить consumer как root: `user: root` в compose
+- Либо изменить permissions socket после создания (publisher delegation) — TBD в v0.2
+
+---
+
+## Frigate-specific
+
+### `s6-overlay-suexec: fatal: can only run as pid 1`
+
+**Симптом**: container Frigate'а в restart loop, в logs только эта ошибка.
+
+**Причина**: `pid: container:<publisher>` сделал Frigate not-PID-1 в shared namespace. s6-overlay v3 strictly требует PID 1 для proper signal handling/zombie reaping.
+
+**Fix**: убрать `pid: container:` для Frigate. Только `ipc: container:` shared.
+
+**Trade-off**: без shared pid некоторые edge cases CUDA IPC ломаются (см. [соответствующую секцию](#cudaipcopeneventhandle-invalid-device-context)). Frigate **на практике** работает потому что подключается до того как CUDA driver проверяет peer (race window race), но если publisher restart'нётся посередине — Frigate'у не удастся пере-подключиться без перезапуска.
+
+**Real fix** (planned v0.2): encoded packet sharing — Frigate detect получает кадры через decoded path (work-around), record получает encoded через socket-based protocol который **не** требует cudaIpcOpenEventHandle.
+
+### `No such filter: 'scale_cuda'`
+
+**Симптом**: Frigate ffmpeg subprocess падает с этой ошибкой в `AVFilterGraph`.
+
+**Причина**: наш patched FFmpeg собран без `--enable-cuda-llvm` (см. [stdbit.h grабля](#stdbith-no-such-file-при---enable-cuda-llvm)). Без cuda-llvm в FFmpeg нет CUDA filters (scale_cuda, overlay_cuda).
+
+**Fix**: в Frigate config.yml явно отключи hwaccel cuda:
+```yaml
+ffmpeg:
+  hwaccel_args: []   # CPU scale вместо scale_cuda
+```
+
+Cost: 5-10% CPU per FHD25 камера. **Real fix** (v0.2): publisher-side resize в cuframes сам.
+
+### Missing dynamic .so после ffmpeg replace
+
+**Симптом**: после `docker cp` patched ffmpeg в Frigate container — `ldd ffmpeg`
+показывает `libharfbuzz.so.0 => not found`, `libfribidi.so.0 => not found`, …
+~20 missing .so.
+
+**Причина**: Frigate's bundled ffmpeg **статически слинкован** (NickM-27/FFmpeg-Builds
+делает full static build). Все 30+ deps встроены в один binary. Frigate runtime
+image **не имеет** этих .so packages installed (ему не надо — bundled ffmpeg
+self-contained).
+
+Наш custom ffmpeg — **dynamic linked** (apt deps). Нужны .so на target.
+
+**Fix**: либо
+- `apt install` missing libs в Frigate (additive image modification):
+  ```bash
+  apt install libharfbuzz0b libfribidi0 librist4 libsrt1.5-openssl libssh-4 \
+              libvpx7 libwebpmux3 libwebp7 libdav1d6 libaom3 libmp3lame0 \
+              libsvtav1enc1 libtheora0 libvorbis0a libvorbisenc2 \
+              libx264-164 libx265-199 libopus0
+  ```
+- Либо строить наш ffmpeg static (sources from NickM-27 pipeline) — complex
+  (см. [zlib download / GMP compiler граблю](#zlib-download-failed-в-crosstool-ng))
+
+Best practice: создать `Dockerfile.frigate` overlay поверх Frigate image,
+который добавляет deps и копирует ffmpeg. Запечь в image, не in-place patch.
+
+---
+
+## Build / FFmpeg patch
+
+### `libcuframes not found` при configure
+
+**Симптом**: FFmpeg configure (с `--enable-libcuframes`) fails с этой ошибкой
+из `enabled libcuframes && require libcuframes ...`. config.log показывает
+`fatal error: cuframes/cuframes.h: No such file or directory`.
+
+**Причины**:
+
+1. **CMake install rules отсутствовали** в libcuframes (early commits до 601806a).
+   `cmake --install` создавал пустой prefix. Fix: обновить cuframes до ≥ 601806a.
+
+2. **Wrong HINTS в find_library**: твой проект ищет в `${CUFRAMES_ROOT}/build/...`
+   но install layout кладёт в `${CUFRAMES_ROOT}/lib`. Добавь оба пути в HINTS.
+
+3. **`rm -f libcuframes.so*`** удалил .so но **.a** file называется
+   `libcuframes_static.a` (не `libcuframes.a`) → linker не находит `-lcuframes`.
+   Fix: либо не удаляй .so, либо переименуй .a при install.
+
+### `ffbuild/library.mak: No such file`
+
+**Симптом**: configure FFmpeg success, но `make` падает сразу:
+`Makefile:123: ffbuild/library.mak: No such file or directory`.
+
+**Причина**: вы сделали ваш fork FFmpeg через snapshot (не git clone), и **случайно
+исключили `ffbuild/`** в rsync. Это **source files** FFmpeg, не build artifacts.
+
+**Fix**: убедись что `ffbuild/` есть в твоём FFmpeg checkout (`ls ffbuild/library.mak`).
+Если делаешь snapshot через rsync — не используй `--exclude=ffbuild`.
+
+### `could not find a working compiler` (GMP)
+
+**Симптом**: crosstool-NG build падает на `Installing GMP for host` с
+`configure: error: could not find a working compiler`. config.log показывает
+`no, long long reliability test 1`.
+
+**Причина**: GMP 6.2.1 имеет known issue с GCC 11+ (Ubuntu 22.04 default).
+Проверка long-long reliability fail'ит false-positive.
+
+**Fix**: pin GMP к 6.3.0 в `ct-ng-config`:
+```
+CT_GMP_V_6_3=y
+# CT_GMP_V_6_2 is not set
+CT_GMP_VERSION="6.3.0"
+```
+
+И убедись что crosstool-NG version (commit) поддерживает 6.3.0 (≥ master 2024-09).
+
+### `zlib: download failed` в crosstool-NG
+
+**Симптом**: crosstool-NG step `Retrieving 'zlib-1.2.12'` fail'ит.
+
+**Причина**: zlib.net убрали старые versions с дефолтного location — теперь они
+только в `/fossils/` subdirectory. Crosstool-NG hardcoded URL не работает.
+
+**Fix**: pre-fetch tarball + положить в local cache:
+```bash
+wget https://zlib.net/fossils/zlib-1.2.12.tar.gz -O preload/zlib-1.2.12.tar.gz
+```
+
+В Dockerfile перед `ct-ng build`:
+```dockerfile
+COPY preload/*.tar.gz /root/src/
+```
+
+`CT_LOCAL_TARBALLS_DIR=${HOME}/src` — crosstool-NG найдёт в cache и не пойдёт
+download.
+
+### `stdbit.h: No such file` при `--enable-cuda-llvm`
+
+**Симптом**: FFmpeg configure с `--enable-cuda-llvm` fail'ит:
+`fatal error: stdbit.h: No such file or directory`. ERROR: cuda_llvm requested
+but not found.
+
+**Причина**: `stdbit.h` — C23 standard header. Доступен в glibc ≥ 2.38.
+
+- Ubuntu 22.04 = glibc 2.35 — **нет**
+- Debian 12 = glibc 2.36 — **нет**
+- Ubuntu 24.04 = glibc 2.39 — есть
+- Debian 13 (trixie) = glibc 2.38+ — есть
+
+**Fix options**:
+1. Build на newer base (Ubuntu 24.04+). Но runtime target (Frigate Debian 12)
+   не запустит binary с glibc-2.38 symbols (backwards-incompatible).
+2. Убрать `--enable-cuda-llvm`. Потеря: CUDA filters (`scale_cuda`, `overlay_cuda`,
+   `hwupload_cuda`). Decode/encode через NVDEC/NVENC всё равно работают.
+3. Дождаться когда Frigate base обновится до newer Debian — вне твоего контроля.
+
+**На практике**: убираем cuda-llvm, в Frigate config `hwaccel_args: []`.
+См. [scale_cuda секцию](#no-such-filter-scale_cuda).
+
+---
+
+## Docker / IPC
+
+### Cross-container CUDA IPC: ipc + pid namespace share
+
+| Что нужно | Compose option |
+|---|---|
+| /dev/shm shared (для cuframes header + SHM ring) | `ipc: container:<publisher>` (либо `ipc: shareable` у publisher + same у consumer) |
+| /proc visibility (для CUDA IPC peer validation) | `pid: container:<publisher>` |
+| `/run/cuframes/*.sock` доступен | volume mount: `cuframes_sock:/run/cuframes:ro` |
+| GPU access | `runtime: nvidia` |
+| Socket permissions | `user: root` (либо chmod socket в publisher) |
+
+**Все 5** должны быть выполнены. Один пропуск — fail при subscriber_create или
+cudaIpcOpenEventHandle.
+
+### Buildx container driver не видит host images
+
+**Симптом**: при использовании custom buildx builder (`docker buildx create
+--driver docker-container ...`) с `FROM local-image:tag` — error `failed to
+authorize: 403 Forbidden` (buildkit пытается pull с registry).
+
+**Причина**: container driver buildx изолирован, не имеет доступа к host's
+local docker daemon images. Pull через registry.
+
+**Fix**: либо
+- Не использовать custom builder — `docker buildx use default` (использует host
+  daemon). Минус: теряем `--cache-to/--cache-from type=local`.
+- Либо push local image в **registry** (local или gitea), и buildx pull'ит оттуда.
+
+---
+
+## Networking / RTSP
+
+### RTSP/RTP UDP не доходит до клиента (docker NAT)
+
+**Симптом**: RTSP server в docker контейнере с `ports: "554:8555"`. Клиент (TV, VLC)
+делает RTSP SETUP successfully (TCP control работает), но video frames не приходят.
+
+**Причина**: RTP идёт **UDP**, sourced из docker network namespace. SNAT MASQUERADE
+для outbound работает, но RTP destination port (которое клиент опубликовал в SETUP)
+**не маппится обратно** через docker bridge — клиент видит UDP packets от чужого
+source IP (docker network 172.x), не от 192.168.88.23 как expected.
+
+**Fix**: `network_mode: host` для RTSP-server контейнера. Тогда server listens
+**напрямую** на host interfaces, RTP packets идут без NAT.
+
+Trade-offs:
+- Все ports app'а listen на host network (нет port mapping). Проверь port collisions.
+- DB env vars (postgres:5432 в docker network DNS) надо менять на host paths
+  (`localhost:5433` если postgres exposed на host port 5433).
+
+### `Nonmatching transport in server reply`
+
+**Симптом**: `ffprobe -rtsp_transport tcp -i rtsp://...` falls с этим сообщением.
+
+**Причина**: RTSP server возвращает SDP с UDP-only transport. Client ожидает TCP
+interleaved.
+
+**Fix**: использовать UDP transport: `-rtsp_transport udp` (либо default behavior).
+Если TV не поддерживает UDP — нужен RTSP server который умеет RTP-over-TCP
+interleaved (cctv-processor v0.1 не умеет).
+
+---
+
+## Gitea Actions / CI
+
+### `node: executable file not found`
+
+**Симптом**: первый JS action (например `actions/checkout@v4`) fail'ит:
+`OCI runtime exec failed: exec: "node": executable file not found in $PATH`.
+
+**Причина**: гитея act_runner запускает JS actions через `node`, но твой
+custom container (например `nvidia/cuda:...`) не имеет node installed.
+
+**Fix**: pre-install node в первом `run:` step (до actions/checkout):
+```yaml
+steps:
+  - name: Bootstrap node
+    run: apt-get update && apt-get install -y nodejs git ca-certificates
+  - name: Checkout
+    uses: actions/checkout@v4
+```
+
+Либо использовать container с node pre-installed (`docker.gitea.com/runner-images:ubuntu-22.04`).
+
+### `SyntaxError: Unexpected token '{'` (Node 12)
+
+**Симптом**: после `apt install nodejs` в Ubuntu 22.04 — actions/checkout@v4 fail'ит:
+`SyntaxError: Unexpected token '{' at static {...}`.
+
+**Причина**: Ubuntu 22.04 apt'овский `nodejs` = Node **12**. `actions/checkout@v4`
+скомпилирован для Node 20+ (static class blocks — ES2022).
+
+**Fix**: install Node 20 from NodeSource:
+```bash
+curl -fsSL https://deb.nodesource.com/setup_20.x | bash -
+apt-get install -y nodejs
+```
+
+В Ubuntu 24.04 apt уже даёт Node 20 — там goes автоматически.
@@ -0,0 +1,8 @@
+# Скопировать в .env (не commit'ить!)
+# .env должен быть в .gitignore
+
+# Камеры: пароли admin user'а на Dahua/Hikvision/etc
+CAM_PARKING_PASS=changeme
+
+# Frigate API/UI auth password
+FRIGATE_RTSP_PASSWORD=changeme
@@ -0,0 +1,61 @@
+# examples/frigate-compose
+
+Reference docker-compose для Frigate + cuframes integration. **НЕ** копировать
+в production бездумно — это шаблон, адаптируй под свою инфру (IP-адреса камер,
+пароли, mount paths, network).
+
+## Quickstart
+
+1. Build patched Frigate image (single-time setup, ~15 мин):
+   ```bash
+   # См. docs/integrations/frigate.md, Шаг 1 — там полный Dockerfile.
+   docker build -t local/frigate-cuframes:latest -f Dockerfile.frigate .
+   ```
+
+2. Pull cuframes publisher image:
+   ```bash
+   docker pull git.goldix.org/gx/cuframes:0.1
+   # либо собрать local: docker build -t local/cuframes:0.1 -f docker/Dockerfile.runtime ../..
+   ```
+
+3. Скопировать .env:
+   ```bash
+   cp .env.example .env
+   $EDITOR .env   # подставь свои camera passwords
+   ```
+
+4. Адаптировать `docker-compose.yml`:
+   - `parking-cam-ip` → реальный IP камеры
+   - `--key cam-parking` → имя по вкусу (должно matche'ить config.yml `cuframes://<key>`)
+   - `cam-parking` в Frigate config → так же matched
+
+5. Адаптировать `config/config.yml`:
+   - детектор (cpu / onnx / tensorrt)
+   - пути к media
+   - дополнительные камеры если нужно
+
+6. Run:
+   ```bash
+   docker compose up -d
+   docker logs -f frigate
+   # UI: http://localhost:5000 (internal) либо https://localhost:8971 (auth)
+   ```
+
+## Что демонстрирует
+
+- Один publisher (`cuframes-pub-parking`) делает 1× NVDEC на parking-камеру
+- Frigate подключается к publisher через `ipc:container:` + `cuframes://` URL
+- Frigate **не** делает свой NVDEC для detect-path — берёт готовые NV12 frames
+
+## Что НЕ демонстрирует
+
+- Record path — Frigate всё ещё открывает второй RTSP к камере (для архива
+  `-c:v copy` mux). v0.2 cuframes решит через encoded packet sharing
+  (см. [issue #2](https://git.goldix.org/gx/cuframes/issues/2))
+- Multi-camera setup — добавь больше publisher'ов и camera-blocks в config.yml
+- HA/MQTT интеграция — добавь свой mqtt block
+
+## См. также
+
+- [docs/integrations/frigate.md](../../docs/integrations/frigate.md) — полный walkthrough
+- [docs/integration.md](../../docs/integration.md) — общая интеграция
@@ -0,0 +1,49 @@
+# Minimal Frigate config с cuframes integration.
+# Полный guide: docs/integrations/frigate.md
+
+mqtt:
+  enabled: false
+
+detectors:
+  # Замени на свой detector (tensorrt / onnx / cpu). Здесь — placeholder.
+  cpu:
+    type: cpu
+
+# CRITICAL: hwaccel cuda отключён — наш patched ffmpeg без --enable-cuda-llvm
+# (не работает на glibc < 2.38 что у Debian 12, на котором Frigate runtime).
+# Без cuda-llvm нет scale_cuda filter. Detect-path использует CPU scale, но
+# decode уже сделан у publisher'а — net выигрыш всё равно.
+ffmpeg:
+  hwaccel_args: []
+  output_args:
+    record: preset-record-generic-audio-aac
+
+cameras:
+  parking_overview:
+    enabled: true
+    friendly_name: Парковка
+    ffmpeg:
+      inputs:
+        # main (full-res) — только запись в архив через прямой RTSP (`-c:v copy`, no decode у Frigate)
+        # После cuframes v0.2 этот path тоже может через cuframes_packets:// (encoded share)
+        - path: rtsp://admin:${FRIGATE_RTSP_PASSWORD}@parking-cam-ip:554/cam/realmonitor?channel=1&subtype=0
+          roles: [record]
+
+        # sub-stream → через cuframes (decoded у publisher'а, без второго NVDEC)
+        - path: cuframes://cam-parking
+          input_args: -f cuframes
+          roles: [detect]
+    detect:
+      width: 640
+      height: 480
+      fps: 5
+
+record:
+  enabled: true
+  retain:
+    days: 7
+
+snapshots:
+  enabled: true
+  retain:
+    default: 7
@@ -0,0 +1,73 @@
+# Reference docker-compose для Frigate + cuframes integration.
+# Полный guide: docs/integrations/frigate.md
+#
+# Что нужно подготовить заранее:
+#   1. Build local image local/frigate-cuframes:latest по Dockerfile.frigate
+#      (см. docs/integrations/frigate.md, Шаг 1)
+#   2. Pull cuframes runtime image:
+#      docker pull git.goldix.org/gx/cuframes:0.1   # либо собрать local
+#   3. Скопировать config/config.yml (placeholder в config/ рядом)
+#   4. .env с CAM_PARKING_PASS=... и FRIGATE_RTSP_PASSWORD=...
+#
+# Запуск:
+#   docker compose up -d
+#   # UI: http://host:5000 (internal, без auth) либо https://host:8971 (with auth)
+
+services:
+  # 1× publisher на камеру — single source of RTSP + NVDEC
+  cuframes-pub-parking:
+    image: git.goldix.org/gx/cuframes:0.1
+    container_name: cuframes-pub-parking
+    restart: unless-stopped
+    runtime: nvidia
+    ipc: shareable
+    shm_size: 256m
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    volumes:
+      - cuframes_sock:/run/cuframes
+    command:
+      - /usr/local/bin/cuframes-rtsp-source
+      - --rtsp
+      # Используем sub-stream для detect-path (lighter resolution, тот же camera load)
+      - "rtsp://admin:${CAM_PARKING_PASS}@parking-cam-ip:554/cam/realmonitor?channel=1&subtype=1"
+      - --key
+      - cam-parking
+      - --ring
+      - "6"
+      - --verbose
+
+  frigate:
+    image: local/frigate-cuframes:latest   # см. docs/integrations/frigate.md Шаг 1
+    container_name: frigate
+    restart: unless-stopped
+    depends_on:
+      cuframes-pub-parking:
+        condition: service_started
+    runtime: nvidia
+    privileged: true
+    shm_size: 512m
+    # WARN: только ipc share — pid НЕ shared (Frigate's s6-overlay требует PID 1).
+    # Frigate подсоединяется к first CUDA context publisher'а в shared /dev/shm.
+    ipc: "container:cuframes-pub-parking"
+    environment:
+      FRIGATE_RTSP_PASSWORD: "${FRIGATE_RTSP_PASSWORD}"
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    ports:
+      - "5000:5000"        # UI без auth (internal, не expose external!)
+      - "8971:8971"        # UI с HTTPS + auth
+      - "8554:8554"        # RTSP restream (go2rtc)
+      - "8555:8555/tcp"
+      - "8555:8555/udp"
+    volumes:
+      - cuframes_sock:/run/cuframes:ro
+      - ./config/config.yml:/config/config.yml:ro
+      - ./media:/media/frigate
+      - type: tmpfs
+        target: /tmp/cache
+        tmpfs: { size: 1000000000 }
+
+volumes:
+  cuframes_sock:
@@ -0,0 +1,78 @@
+# examples/python-consumer
+
+Reference Python consumer для cuframes через `ctypes` wrapper.
+
+## Use case
+
+AI/ML pipeline (PyTorch / ONNX / TensorRT) которому нужны декодированные кадры
+с камер. Без cuframes — каждый Python скрипт открывает RTSP + decode сам.
+С cuframes — подписывается на готовые NV12 frames от publisher'а.
+
+## Запуск
+
+```bash
+# Publisher должен быть запущен (см. tools/cuframes-rtsp-source или Docker image)
+cuframes-rtsp-source --rtsp rtsp://admin:pw@cam-ip:554/... --key cam-parking &
+
+# Consumer (same host, либо same docker namespace — см. требования ниже)
+python3 cuframes_consumer.py --key cam-parking --max-frames 100
+```
+
+Ожидаемый output:
+```
+[consumer] connected to 'cam-parking'
+[consumer] first frame: 640x480 NV12, pitch_y=640, pitch_uv=640, cuda_ptr=0x...
+[consumer] received=25 seq=42 pts_ms=...
+...
+=== RESULT ===
+received: 100 / 100
+elapsed: 3.96s
+avg_fps: 25.03
+```
+
+## Что этот пример НЕ делает
+
+- **НЕ копирует** GPU NV12 frame на host — `cuda_ptr` это raw CUDA device pointer.
+  Для реальной работы нужно:
+  - `pycuda` / `cupy` / `cuda-python` библиотека для CUDA memcpy
+  - либо передать `cuda_ptr` напрямую в GPU-aware ML framework (PyTorch's
+    `torch.cuda.IntTensor.from_dlpack` etc.)
+
+- **НЕ конвертирует** NV12 → RGB. Используй `cv2.cvtColor(nv12, cv2.COLOR_YUV2RGB_NV12)`
+  на host или GPU-side conversion.
+
+- **НЕ обрабатывает** inference — это skeleton, в твоём pipeline replace
+  comment-block `### ВАШ ML PIPELINE ЗДЕСЬ ###` с актуальным кодом.
+
+## Требования
+
+| | Значение |
+|---|---|
+| Python | 3.8+ |
+| `libcuframes.so.0` | в `LD_LIBRARY_PATH` (либо `/usr/local/lib`) |
+| Publisher running | да, с matching `--key` |
+| Same IPC namespace | да (host либо `ipc:container:<publisher>` в docker) |
+| Same PID namespace | да (host либо `pid:container:<publisher>` в docker) |
+| NVIDIA GPU + driver | для access `cuda_ptr` (read-only frame от publisher'а) |
+
+## Docker-style
+
+```yaml
+# В compose рядом с publisher service
+ai-pipeline:
+  image: your-ai-image:cuda
+  runtime: nvidia
+  ipc: "container:cuframes-pub-parking"
+  pid: "container:cuframes-pub-parking"
+  volumes:
+    - cuframes_sock:/run/cuframes:ro
+  environment:
+    LD_LIBRARY_PATH: /usr/local/lib
+  command: python3 /app/cuframes_consumer.py --key cam-parking --max-frames 1000000
+```
+
+## v0.3 → first-class pybind11 bindings
+
+Текущий ctypes pattern будет заменён на native pybind11 bindings в v0.3 cuframes
+([ROADMAP.md](../../ROADMAP.md)). Тогда API будет более pythonic + zero-copy через
+`__cuda_array_interface__` / `dlpack`.
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""
+Reference Python consumer для cuframes (через ctypes wrapper).
+
+До v0.3 (когда появятся первоклассные pybind11 bindings) — это minimal
+working pattern для AI/ML скриптов которые хотят подписаться на cuframes IPC.
+
+Pattern:
+  1. subscribe to cuframes (open libcuframes.so via ctypes)
+  2. в цикле: получить next() frame
+  3. cudaMemcpy → host (через pycuda либо отдельной CUDA-Python библиотекой)
+  4. передать в свой ML pipeline (ONNX/TensorRT/PyTorch)
+  5. release frame обратно publisher'у
+
+Limitations:
+  - Этот skeleton НЕ делает actual CUDA copy (нужна pycuda / cupy / cuda-python)
+  - Только sync API
+  - Только NV12 (v0.1)
+
+Запуск:
+  python3 cuframes_consumer.py --key cam-parking --max-frames 100
+
+Требования (на target host):
+  - libcuframes.so в LD_LIBRARY_PATH (либо apt install / docker)
+  - publisher запущен (cuframes-rtsp-source --key cam-parking ...)
+  - same IPC + PID namespace что publisher (если в docker — ipc:container: + pid:container:)
+"""
+
+import argparse
+import ctypes
+import sys
+import time
+from ctypes import c_int, c_int32, c_int64, c_uint64, c_uint32, c_char_p, c_void_p, c_size_t, POINTER, Structure
+
+
+# ─── C API bindings ─────────────────────────────────────────────────────
+
+# Error codes
+CUFRAMES_OK = 0
+CUFRAMES_ERR_TIMEOUT = -7
+CUFRAMES_ERR_WOULD_BLOCK = -11
+CUFRAMES_ERR_DISCONNECTED = -9
+
+# Modes
+CUFRAMES_MODE_NEWEST_ONLY = 0
+CUFRAMES_MODE_STRICT_ORDER = 1
+
+# Pixel format
+CUFRAMES_FORMAT_NV12 = 0
+
+
+class SubscriberConfig(Structure):
+    """Соответствует C struct cuframes_subscriber_config."""
+    _fields_ = [
+        ("key", c_char_p),
+        ("consumer_name", c_char_p),
+        ("mode", c_int),
+        ("cuda_device", c_int32),
+        ("connect_timeout_ms", c_int32),
+        ("_reserved", c_uint64 * 4),
+    ]
+
+
+def _load_libcuframes():
+    """Загрузить libcuframes.so + bind ctypes signatures."""
+    try:
+        lib = ctypes.CDLL("libcuframes.so.0")
+    except OSError as e:
+        sys.stderr.write(f"Cannot load libcuframes.so.0: {e}\n")
+        sys.stderr.write("Установи libcuframes (см. cuframes README) и убедись что .so в LD_LIBRARY_PATH.\n")
+        sys.exit(1)
+
+    # cuframes_strerror
+    lib.cuframes_strerror.argtypes = [c_int]
+    lib.cuframes_strerror.restype = c_char_p
+
+    # cuframes_subscriber_create
+    lib.cuframes_subscriber_create.argtypes = [POINTER(SubscriberConfig), POINTER(c_void_p)]
+    lib.cuframes_subscriber_create.restype = c_int
+
+    # cuframes_subscriber_next  (consumer_stream=NULL — sync API, default stream)
+    lib.cuframes_subscriber_next.argtypes = [c_void_p, c_void_p, POINTER(c_void_p), c_int32]
+    lib.cuframes_subscriber_next.restype = c_int
+
+    # cuframes_subscriber_release
+    lib.cuframes_subscriber_release.argtypes = [c_void_p, c_void_p]
+    lib.cuframes_subscriber_release.restype = c_int
+
+    # cuframes_subscriber_destroy
+    lib.cuframes_subscriber_destroy.argtypes = [c_void_p]
+    lib.cuframes_subscriber_destroy.restype = c_int
+
+    # cuframes_frame_* accessors
+    lib.cuframes_frame_cuda_ptr.argtypes = [c_void_p]
+    lib.cuframes_frame_cuda_ptr.restype = c_void_p
+
+    lib.cuframes_frame_size.argtypes = [c_void_p, POINTER(c_int32), POINTER(c_int32)]
+    lib.cuframes_frame_size.restype = None
+
+    lib.cuframes_frame_pitch_y.argtypes = [c_void_p]
+    lib.cuframes_frame_pitch_y.restype = c_int32
+
+    lib.cuframes_frame_pitch_uv.argtypes = [c_void_p]
+    lib.cuframes_frame_pitch_uv.restype = c_int32
+
+    lib.cuframes_frame_seq.argtypes = [c_void_p]
+    lib.cuframes_frame_seq.restype = c_uint64
+
+    lib.cuframes_frame_pts_ns.argtypes = [c_void_p]
+    lib.cuframes_frame_pts_ns.restype = c_int64
+
+    return lib
+
+
+# ─── Main consumer loop ────────────────────────────────────────────────
+
+def main():
+    ap = argparse.ArgumentParser(description="Reference cuframes Python consumer")
+    ap.add_argument("--key", required=True, help="publisher key (e.g. cam-parking)")
+    ap.add_argument("--max-frames", type=int, default=100, help="N frames to receive (default 100)")
+    ap.add_argument("--cuda-device", type=int, default=0)
+    ap.add_argument("--timeout-ms", type=int, default=1000, help="per-frame timeout")
+    args = ap.parse_args()
+
+    lib = _load_libcuframes()
+
+    # Configure subscriber
+    cfg = SubscriberConfig()
+    cfg.key = args.key.encode("utf-8")
+    cfg.consumer_name = None  # auto-generated
+    cfg.mode = CUFRAMES_MODE_NEWEST_ONLY
+    cfg.cuda_device = args.cuda_device
+    cfg.connect_timeout_ms = 5000
+
+    sub_handle = c_void_p()
+    rc = lib.cuframes_subscriber_create(ctypes.byref(cfg), ctypes.byref(sub_handle))
+    if rc != CUFRAMES_OK:
+        sys.stderr.write(f"subscribe failed: {lib.cuframes_strerror(rc).decode()}\n")
+        sys.exit(1)
+
+    print(f"[consumer] connected to '{args.key}'")
+
+    received = 0
+    first_pts = None
+    start_wall = None
+
+    try:
+        while received < args.max_frames:
+            frame_handle = c_void_p()
+            rc = lib.cuframes_subscriber_next(sub_handle, None, ctypes.byref(frame_handle),
+                                              args.timeout_ms)
+
+            if rc == CUFRAMES_ERR_TIMEOUT or rc == CUFRAMES_ERR_WOULD_BLOCK:
+                continue
+            if rc == CUFRAMES_ERR_DISCONNECTED:
+                print(f"[consumer] publisher disconnected — exit")
+                break
+            if rc != CUFRAMES_OK or not frame_handle.value:
+                sys.stderr.write(f"next failed: {lib.cuframes_strerror(rc).decode()}\n")
+                break
+
+            # Frame metadata
+            w, h = c_int32(0), c_int32(0)
+            lib.cuframes_frame_size(frame_handle, ctypes.byref(w), ctypes.byref(h))
+            pitch_y = lib.cuframes_frame_pitch_y(frame_handle)
+            pitch_uv = lib.cuframes_frame_pitch_uv(frame_handle)
+            cuda_ptr = lib.cuframes_frame_cuda_ptr(frame_handle)
+            seq = lib.cuframes_frame_seq(frame_handle)
+            pts_ns = lib.cuframes_frame_pts_ns(frame_handle)
+
+            if first_pts is None:
+                first_pts = pts_ns
+                start_wall = time.monotonic()
+                print(f"[consumer] first frame: {w.value}x{h.value} NV12, "
+                      f"pitch_y={pitch_y}, pitch_uv={pitch_uv}, cuda_ptr=0x{cuda_ptr:x}")
+
+            # ─── ВАШ ML PIPELINE ЗДЕСЬ ────────────────────────────
+            # 1. cudaMemcpy NV12 frame → host (или используй pycuda / cupy для in-GPU pipeline)
+            # 2. NV12 → RGB conversion (CPU либо GPU)
+            # 3. inference: model(frame) → results
+            # 4. publish results (mqtt / API / etc)
+            #
+            # В этом skeleton — просто counter.
+            received += 1
+            if received % 25 == 0:
+                print(f"[consumer] received={received} seq={seq} pts_ms={pts_ns // 1_000_000}")
+
+            # CRITICAL: release frame ОБЯЗАТЕЛЬНО — иначе publisher застрянет
+            # (или drop new frames при ring overflow в STRICT_ORDER mode).
+            lib.cuframes_subscriber_release(sub_handle, frame_handle)
+
+    finally:
+        lib.cuframes_subscriber_destroy(sub_handle)
+
+    if received > 1 and start_wall:
+        elapsed = time.monotonic() - start_wall
+        fps = (received - 1) / elapsed if elapsed > 0 else 0
+        print(f"\n=== RESULT ===")
+        print(f"received: {received} / {args.max_frames}")
+        print(f"elapsed:  {elapsed:.2f}s")
+        print(f"avg_fps:  {fps:.2f}")
+    sys.exit(0 if received >= args.max_frames else 1)
+
+
+if __name__ == "__main__":
+    main()
@@ -36,7 +36,7 @@ extern "C" {
 /* ─────────────────────────────────────────────────────────────────────── */

 #define CUFRAMES_VERSION_MAJOR 0
-#define CUFRAMES_VERSION_MINOR 1
+#define CUFRAMES_VERSION_MINOR 3
 #define CUFRAMES_VERSION_PATCH 0

 /** @brief Runtime-версия библиотеки в формате "MAJOR.MINOR.PATCH". */
@@ -65,6 +65,11 @@ typedef enum cuframes_error {
                                                 несовпадение размеров frame'а */
    CUFRAMES_ERR_WOULD_BLOCK     =  -11,  /**< non-blocking call — no data yet */
    CUFRAMES_ERR_TOO_MANY        =  -12,  /**< превышен MAX_SUBSCRIBERS (32) */
+    /* v0.2 — packet ring (см. docs/protocol.md §10.15) */
+    CUFRAMES_ERR_PACKET_OVERSIZED  =  -20,  /**< publish_packet size > max_packet_size */
+    CUFRAMES_ERR_NO_PACKET_RING    =  -21,  /**< subscriber запросил packets, у publisher'а нет ring'а */
+    CUFRAMES_ERR_NO_CODEC_PARAMS   =  -22,  /**< extradata ещё не set publisher'ом */
+    CUFRAMES_ERR_PACKET_OVERRUN    =  -23,  /**< slow subscriber, packet seq уехал — resync на keyframe */
    CUFRAMES_ERR_INTERNAL        = -100,  /**< bug в библиотеке — repro и reportить */
 } cuframes_error_t;

@@ -366,6 +371,142 @@ int cuframes_async_subscriber_create(const cuframes_subscriber_config_t *cfg,
 */
 int cuframes_async_subscriber_destroy(cuframes_async_subscriber_t *sub);

+/* ─────────────────────────────────────────────────────────────────────── */
+/* Encoded packet ring API (v0.2 — см. docs/protocol.md §10)                */
+/* ─────────────────────────────────────────────────────────────────────── */
+
+/** Packet flags — биты соответствуют AV_PKT_FLAG_* у FFmpeg. */
+#define CUFRAMES_PKT_FLAG_KEY            0x01u  /**< IDR / keyframe */
+#define CUFRAMES_PKT_FLAG_CORRUPT        0x02u  /**< RTP loss / damage */
+#define CUFRAMES_PKT_FLAG_DISCONTINUITY  0x04u  /**< gap before this packet */
+#define CUFRAMES_PKT_FLAG_LAST_IN_AU     0x08u  /**< последний NAL в access unit */
+
+typedef struct cuframes_packet_ring_options {
+    /** Слотов в индексе ring'а. Default 64 (≈ 2 sec @ 30fps + GOP). */
+    uint32_t ring_slots;
+    /** Размер data section ring'а в байтах. Default 8 MiB. */
+    uint32_t data_size;
+    /** Sanity guard — publisher отклонит packet > этого. Default 2 MiB. */
+    uint32_t max_packet_size;
+    /** FFmpeg AV_CODEC_ID_* (H.264 = 27, HEVC = 173). */
+    uint32_t codec_id;
+    uint64_t _reserved[4];
+} cuframes_packet_ring_options_t;
+
+/**
+ * @brief Активировать encoded packet ring на существующем publisher'е.
+ *
+ * Создаёт дополнительный SHM `/dev/shm/cuframes-<key>-packets`. После
+ * этого call'а publisher шлёт packets через `cuframes_publisher_publish_packet`.
+ *
+ * Должно быть вызвано **до** первого `publish_packet` и желательно до того
+ * как subscribers начнут подключаться (иначе они увидят publisher без packet
+ * ring и не получат packets).
+ *
+ * @param pub
+ * @param opts  NULL = default sizing (64 slots, 8MiB data, 2MiB max). codec_id=0 = unknown.
+ * @return CUFRAMES_ERR_ALREADY_EXISTS если ring уже активирован
+ */
+int cuframes_publisher_enable_packets(cuframes_publisher_t *pub,
+                                       const cuframes_packet_ring_options_t *opts);
+
+/**
+ * @brief Установить codec extradata (SPS/PPS/VPS) для packet ring.
+ *
+ * Subscribers (FFmpeg demuxer) читают extradata из shared header и подставляют
+ * в AVCodecContext.extradata. Должно быть вызвано до того как subscribers
+ * захотят decode.
+ *
+ * @param size  ≤ 4096 байт (CUFRAMES_PKT_EXTRADATA_MAX)
+ */
+int cuframes_publisher_set_codec_extradata(cuframes_publisher_t *pub,
+                                            const void *extradata, size_t size);
+
+/**
+ * @brief Опубликовать encoded packet (H.264/H.265 NAL units, Annex B).
+ *
+ * Slow consumer = overwrite oldest. Late subscriber resync'нется на last
+ * keyframe (см. docs/protocol.md §10.14).
+ *
+ * @param flags  CUFRAMES_PKT_FLAG_* (минимум KEY на IDR — критично!)
+ * @return CUFRAMES_ERR_NO_PACKET_RING если не вызывали enable_packets
+ * @return CUFRAMES_ERR_PACKET_OVERSIZED если size > max_packet_size
+ */
+int cuframes_publisher_publish_packet(cuframes_publisher_t *pub,
+                                       const void *data, size_t size,
+                                       int64_t pts_ns, int64_t dts_ns,
+                                       uint32_t flags);
+
+/* ── Subscriber-side packet API ───────────────────────────────────────── */
+
+/** Opaque packet handle. Освобождается через release_packet. */
+typedef struct cuframes_packet cuframes_packet_t;
+
+/** @brief Pointer на encoded NAL bytes. Valid до release_packet. */
+const void *cuframes_packet_data(const cuframes_packet_t *p);
+
+/** @brief Размер payload в байтах. */
+size_t cuframes_packet_size(const cuframes_packet_t *p);
+
+/** @brief Presentation timestamp (наносекунды). */
+int64_t cuframes_packet_pts(const cuframes_packet_t *p);
+
+/** @brief Decode timestamp (для B-frames pipelines). */
+int64_t cuframes_packet_dts(const cuframes_packet_t *p);
+
+/** @brief Биты CUFRAMES_PKT_FLAG_*. */
+uint32_t cuframes_packet_flags(const cuframes_packet_t *p);
+
+/** @brief Sequence number у publisher'а. */
+uint64_t cuframes_packet_seq(const cuframes_packet_t *p);
+
+/**
+ * @brief Активировать чтение packet ring на subscriber'е.
+ *
+ * Открывает SHM `/dev/shm/cuframes-<key>-packets` (тот же `key` что в config).
+ * После этого можно читать через `cuframes_subscriber_next_packet`.
+ *
+ * Subscriber может одновременно иметь frames ring и packets ring (или один из).
+ *
+ * @return CUFRAMES_ERR_NOT_FOUND если publisher не имеет packet ring
+ */
+int cuframes_subscriber_enable_packets(cuframes_subscriber_t *sub);
+
+/**
+ * @brief Получить следующий packet.
+ *
+ * Late subscriber (первый вызов) начинает с last_keyframe_seq publisher'а —
+ * decoder receive'нет valid stream без glitches.
+ *
+ * Полученный packet ОБЯЗАТЕЛЬНО освободить через
+ * cuframes_subscriber_release_packet().
+ *
+ * @param timeout_ms <0 = блокироваться, 0 = non-blocking (WOULD_BLOCK), >0 = с таймаутом
+ * @return CUFRAMES_ERR_PACKET_OVERRUN — subscriber отстал, resync на keyframe (library сделает автоматически на next call)
+ * @return CUFRAMES_ERR_DISCONNECTED — publisher shutdown
+ */
+int cuframes_subscriber_next_packet(cuframes_subscriber_t *sub,
+                                     cuframes_packet_t **pkt_out,
+                                     int32_t timeout_ms);
+
+/** @brief Освободить packet handle. NULL-safe. */
+int cuframes_subscriber_release_packet(cuframes_subscriber_t *sub,
+                                        cuframes_packet_t *pkt);
+
+/**
+ * @brief Получить codec parameters publisher'а.
+ *
+ * `*extradata_out` — pointer в библиотечный buffer, valid пока subscriber жив.
+ * Caller должен скопировать данные если хочет hold past subscriber lifetime.
+ *
+ * @return CUFRAMES_ERR_NO_CODEC_PARAMS если publisher ещё не вызвал
+ *         set_codec_extradata
+ */
+int cuframes_subscriber_get_codec_params(cuframes_subscriber_t *sub,
+                                          uint32_t *codec_id_out,
+                                          const void **extradata_out,
+                                          size_t *extradata_size_out);
+
 /* ─────────────────────────────────────────────────────────────────────── */
 /* Утилиты                                                                 */
 /* ─────────────────────────────────────────────────────────────────────── */
@@ -148,6 +148,23 @@ public:
              "Publisher::publish_external");
    }

+    /* v0.2 — encoded packet ring */
+    void enable_packets(const cuframes_packet_ring_options_t *opts = nullptr) {
+        check(cuframes_publisher_enable_packets(pub_, opts),
+              "Publisher::enable_packets");
+    }
+
+    void set_codec_extradata(const void *data, size_t size) {
+        check(cuframes_publisher_set_codec_extradata(pub_, data, size),
+              "Publisher::set_codec_extradata");
+    }
+
+    /* Returns CUFRAMES_OK / negative error code (без throw — caller решает). */
+    int publish_packet(const void *data, size_t size,
+                        int64_t pts_ns, int64_t dts_ns, uint32_t flags) noexcept {
+        return cuframes_publisher_publish_packet(pub_, data, size, pts_ns, dts_ns, flags);
+    }
+
    cuframes_publisher_t *raw() noexcept { return pub_; }

 private:
@@ -10,6 +10,7 @@ set(CUFRAMES_SOURCES
    src/producer.c
    src/consumer.c
    src/consumer_async.c
+    src/packet_ring.c
 )

 add_library(cuframes SHARED ${CUFRAMES_SOURCES})
@@ -40,7 +41,7 @@ endforeach()

 # Set SOVERSION на shared lib для ABI tracking
 set_target_properties(cuframes PROPERTIES
-    VERSION 0.1.0
+    VERSION 0.3.0
    SOVERSION 0
 )

@@ -6,6 +6,7 @@
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <time.h>
 #include <unistd.h>

 /* Opaque frame — выдаётся subscriber'у на next() */
@@ -23,6 +24,17 @@ struct cuframes_frame {
    void *subscriber;        /* back-ref для release() */
 };

+/* Opaque packet handle — single-packet pattern (как frame_obj). */
+struct cuframes_packet {
+    uint8_t *data;          /* heap buffer, allocated by subscriber на enable_packets */
+    size_t   capacity;      /* size of allocation */
+    size_t   size;          /* actual payload size */
+    int64_t  pts_ns;
+    int64_t  dts_ns;
+    uint32_t flags;
+    uint64_t seq;
+};
+
 struct cuframes_subscriber {
    cuframes_subscriber_config_t cfg;
    char key[CUFRAMES_MAX_KEY_LEN + 1];
@@ -32,16 +44,24 @@ struct cuframes_subscriber {
    cuframes_shm_header_t *hdr;
    char shm_name[80];

-    cudaEvent_t producer_event;
+    cudaEvent_t producer_event;                         /* legacy fallback (v0.2 proto) */
+    cudaEvent_t slot_events[CUFRAMES_MAX_RING];         /* v0.3 — per-slot events */
+    int has_slot_events;                                 /* 1 if v0.3 events opened OK */
    void *mapped_ptrs[CUFRAMES_MAX_RING];

    uint32_t assigned_bit;
    uint64_t last_seen_seq;

-    /* Frame pool — переиспользуем одну frame_t structure (single-thread API).
-     * Опционально расширим до lock-free pool в v0.2 если нужен multi-frame. */
+    /* Frame pool — переиспользуем одну frame_t structure (single-thread API). */
    struct cuframes_frame frame_obj;
    int frame_busy;
+
+    /* v0.2 — packet ring (optional, opened via enable_packets). */
+    int                 has_pkt_ring;
+    cuframes_pkt_ring_t pkt_ring;
+    uint64_t            last_packet_seq;     /* UINT64_MAX = no packet read yet */
+    struct cuframes_packet packet_obj;
+    int                 packet_busy;
 };

 /* ─── Frame accessors ────────────────────────────────────────────────── */
@@ -183,13 +203,37 @@ int cuframes_subscriber_create(const cuframes_subscriber_config_t *cfg,
        r = CUFRAMES_ERR_CUDA; goto fail;
    }

-    /* Open producer's event */
+    /* Open producer's event (legacy single — v0.2 compat fallback) */
    cerr = cudaIpcOpenEventHandle(&sub->producer_event, sub->hdr->ipc_event_handle);
    if (cerr != cudaSuccess) {
        CUFRAMES_LOG_ERROR("cudaIpcOpenEventHandle: %s", cudaGetErrorString(cerr));
        r = CUFRAMES_ERR_CUDA; goto fail;
    }

+    /* v0.3 — open per-slot events если protocol supports. */
+    sub->has_slot_events = 0;
+    if (sub->hdr->proto_version >= CUFRAMES_PROTOCOL_V3) {
+        int ring_evt = (int)sub->hdr->ring_size;
+        if (ring_evt > CUFRAMES_MAX_RING) ring_evt = CUFRAMES_MAX_RING;
+        int evt_ok = 1;
+        for (int i = 0; i < ring_evt; i++) {
+            cerr = cudaIpcOpenEventHandle(&sub->slot_events[i],
+                                           sub->hdr->slot_event_handles[i]);
+            if (cerr != cudaSuccess) {
+                CUFRAMES_LOG_WARN("cudaIpcOpenEventHandle slot %d: %s — "
+                                   "fallback к legacy single event",
+                                   i, cudaGetErrorString(cerr));
+                for (int j = 0; j < i; j++) cudaEventDestroy(sub->slot_events[j]);
+                evt_ok = 0;
+                break;
+            }
+        }
+        if (evt_ok) {
+            sub->has_slot_events = 1;
+            CUFRAMES_LOG_INFO("subscribed с per-slot events (v0.3 proto)");
+        }
+    }
+
    /* Open mem handles */
    int ring = (int)sub->hdr->ring_size;
    if (ring > CUFRAMES_MAX_RING) ring = CUFRAMES_MAX_RING;
@@ -257,10 +301,16 @@ int cuframes_subscriber_next(cuframes_subscriber_t *sub,
            int64_t pts = atomic_load_explicit(&sub->hdr->slots[slot_idx].pts_ns,
                                                memory_order_acquire);

-            /* Cross-process sync: wait event on consumer's stream */
+            /* Cross-process sync: wait event on consumer's stream.
+             * v0.3: per-slot event точно соответствует slot[slot_idx] —
+             * no TOCTOU race possible. v0.2 fallback: single global event +
+             * post-sync verify (less precise, but still correct). */
+            cudaEvent_t sync_event = sub->has_slot_events
+                ? sub->slot_events[slot_idx]
+                : sub->producer_event;
            if (consumer_stream) {
                cudaError_t cerr = cudaStreamWaitEvent((cudaStream_t)consumer_stream,
-                                                         sub->producer_event, 0);
+                                                         sync_event, 0);
                if (cerr != cudaSuccess) {
                    CUFRAMES_LOG_WARN("cudaStreamWaitEvent: %s",
                                       cudaGetErrorString(cerr));
@@ -268,10 +318,21 @@ int cuframes_subscriber_next(cuframes_subscriber_t *sub,
                }
            } else {
                /* Synchronize globally — для cudaMemcpyDeviceToHost users */
-                cudaError_t cerr = cudaEventSynchronize(sub->producer_event);
+                cudaError_t cerr = cudaEventSynchronize(sync_event);
                if (cerr != cudaSuccess) return CUFRAMES_ERR_CUDA;
            }

+            /* TOCTOU защита (v0.2 fallback only): legacy single event signals
+             * для последнего published frame. v0.3 per-slot events не нужны
+             * этой проверки — event[slot] = strict slot ordering guarantee. */
+            if (!sub->has_slot_events) {
+                uint64_t verify_seq = atomic_load_explicit(&sub->hdr->slots[slot_idx].seq,
+                                                           memory_order_acquire);
+                if (verify_seq != target_seq) {
+                    continue;
+                }
+            }
+
            /* Fill frame_out */
            struct cuframes_frame *f = &sub->frame_obj;
            f->cuda_ptr = sub->mapped_ptrs[slot_idx];
@@ -340,6 +401,13 @@ int cuframes_subscriber_destroy(cuframes_subscriber_t *sub) {
    }

    if (sub->producer_event) cudaEventDestroy(sub->producer_event);
+    if (sub->has_slot_events) {
+        int ring_evt = (int)sub->hdr->ring_size;
+        if (ring_evt > CUFRAMES_MAX_RING) ring_evt = CUFRAMES_MAX_RING;
+        for (int i = 0; i < ring_evt; i++) {
+            if (sub->slot_events[i]) cudaEventDestroy(sub->slot_events[i]);
+        }
+    }

    int ring = sub->hdr ? (int)sub->hdr->ring_size : 0;
    if (ring > CUFRAMES_MAX_RING) ring = CUFRAMES_MAX_RING;
@@ -347,9 +415,143 @@ int cuframes_subscriber_destroy(cuframes_subscriber_t *sub) {
        if (sub->mapped_ptrs[i]) cudaIpcCloseMemHandle(sub->mapped_ptrs[i]);
    }

+    /* Packet ring cleanup */
+    if (sub->has_pkt_ring) {
+        cuframes_internal_pkt_ring_destroy(&sub->pkt_ring);
+    }
+    if (sub->packet_obj.data) {
+        free(sub->packet_obj.data);
+        sub->packet_obj.data = NULL;
+    }
+
    if (sub->hdr) munmap(sub->hdr, sizeof(cuframes_shm_header_t));
    if (sub->shm_fd >= 0) close(sub->shm_fd);
    if (sub->sock_fd >= 0) close(sub->sock_fd);
    free(sub);
    return CUFRAMES_OK;
 }
+
+/* ─────────────────────────────────────────────────────────────────────── */
+/* v0.2 — encoded packet ring API (см. docs/protocol.md §10)                */
+/* ─────────────────────────────────────────────────────────────────────── */
+
+/* Packet accessors */
+const void *cuframes_packet_data(const cuframes_packet_t *p)  { return p ? p->data : NULL; }
+size_t      cuframes_packet_size(const cuframes_packet_t *p)  { return p ? p->size : 0; }
+int64_t     cuframes_packet_pts(const cuframes_packet_t *p)   { return p ? p->pts_ns : 0; }
+int64_t     cuframes_packet_dts(const cuframes_packet_t *p)   { return p ? p->dts_ns : 0; }
+uint32_t    cuframes_packet_flags(const cuframes_packet_t *p) { return p ? p->flags : 0; }
+uint64_t    cuframes_packet_seq(const cuframes_packet_t *p)   { return p ? p->seq : 0; }
+
+int cuframes_subscriber_enable_packets(cuframes_subscriber_t *sub) {
+    if (!sub) return CUFRAMES_ERR_INVALID_ARG;
+    if (sub->has_pkt_ring) return CUFRAMES_OK; /* idempotent */
+
+    char pkt_name[128];
+    int r = cuframes_internal_pkt_shm_name(sub->key, pkt_name, sizeof(pkt_name));
+    if (r != CUFRAMES_OK) return r;
+
+    r = cuframes_internal_pkt_ring_open(pkt_name, &sub->pkt_ring);
+    if (r != CUFRAMES_OK) return r;
+
+    /* Allocate copy-buffer (max packet size). Используем data_size как
+     * conservative upper bound (publisher гарантирует data_size >= max_packet_size). */
+    size_t capacity = sub->pkt_ring.hdr->data_size;
+    sub->packet_obj.data = (uint8_t *)malloc(capacity);
+    if (!sub->packet_obj.data) {
+        cuframes_internal_pkt_ring_destroy(&sub->pkt_ring);
+        return CUFRAMES_ERR_OUT_OF_MEMORY;
+    }
+    sub->packet_obj.capacity = capacity;
+
+    /* Start с last_keyframe_seq - 1 → первый read даст IDR (§10.14). */
+    uint64_t kf = atomic_load_explicit(&sub->pkt_ring.hdr->last_keyframe_seq,
+                                        memory_order_acquire);
+    sub->last_packet_seq = (kf == UINT64_MAX) ? UINT64_MAX : kf - 1;
+    sub->has_pkt_ring = 1;
+    return CUFRAMES_OK;
+}
+
+int cuframes_subscriber_next_packet(cuframes_subscriber_t *sub,
+                                     cuframes_packet_t **pkt_out,
+                                     int32_t timeout_ms) {
+    if (!sub || !pkt_out) return CUFRAMES_ERR_INVALID_ARG;
+    if (!sub->has_pkt_ring) return CUFRAMES_ERR_NO_PACKET_RING;
+    if (sub->packet_busy) return CUFRAMES_ERR_INVALID_ARG; /* previous packet not released */
+
+    int64_t deadline_ns = (timeout_ms > 0) ?
+        cuframes_now_ns() + (int64_t)timeout_ms * 1000000LL : 0;
+
+    for (;;) {
+        size_t size = 0;
+        int64_t pts = 0, dts = 0;
+        uint32_t flags = 0;
+        uint64_t seq_attempt = sub->last_packet_seq;
+
+        int r = cuframes_internal_pkt_ring_read(&sub->pkt_ring,
+                                                 &seq_attempt,
+                                                 sub->packet_obj.data,
+                                                 sub->packet_obj.capacity,
+                                                 &size, &pts, &dts, &flags);
+        if (r == CUFRAMES_OK) {
+            sub->last_packet_seq      = seq_attempt;
+            sub->packet_obj.size      = size;
+            sub->packet_obj.pts_ns    = pts;
+            sub->packet_obj.dts_ns    = dts;
+            sub->packet_obj.flags     = flags;
+            sub->packet_obj.seq       = seq_attempt;
+            sub->packet_busy = 1;
+            *pkt_out = &sub->packet_obj;
+            return CUFRAMES_OK;
+        }
+
+        if (r == CUFRAMES_ERR_PACKET_OVERRUN) {
+            /* Resync — установить last_seq = last_keyframe_seq - 1, повторить. */
+            uint64_t kf = atomic_load_explicit(
+                &sub->pkt_ring.hdr->last_keyframe_seq, memory_order_acquire);
+            if (kf != UINT64_MAX) {
+                sub->last_packet_seq = kf - 1;
+            }
+            /* Возвращаем OVERRUN наружу — caller знает что был discontinuity. */
+            *pkt_out = NULL;
+            return CUFRAMES_ERR_PACKET_OVERRUN;
+        }
+
+        if (r != CUFRAMES_ERR_TIMEOUT) {
+            *pkt_out = NULL;
+            return r; /* DISCONNECTED, INVALID_ARG, etc. */
+        }
+
+        /* TIMEOUT branch — poll/sleep */
+        if (timeout_ms == 0) return CUFRAMES_ERR_WOULD_BLOCK;
+        if (timeout_ms > 0 && cuframes_now_ns() >= deadline_ns) {
+            return CUFRAMES_ERR_TIMEOUT;
+        }
+        struct timespec ts = {0, 1 * 1000 * 1000}; /* 1 ms poll interval */
+        nanosleep(&ts, NULL);
+    }
+}
+
+int cuframes_subscriber_release_packet(cuframes_subscriber_t *sub,
+                                        cuframes_packet_t *pkt) {
+    if (!sub) return CUFRAMES_ERR_INVALID_ARG;
+    if (!pkt) return CUFRAMES_OK; /* NULL-safe */
+    if (pkt != &sub->packet_obj) return CUFRAMES_ERR_INVALID_ARG;
+    sub->packet_busy = 0;
+    return CUFRAMES_OK;
+}
+
+int cuframes_subscriber_get_codec_params(cuframes_subscriber_t *sub,
+                                          uint32_t *codec_id_out,
+                                          const void **extradata_out,
+                                          size_t *extradata_size_out) {
+    if (!sub) return CUFRAMES_ERR_INVALID_ARG;
+    if (!sub->has_pkt_ring) return CUFRAMES_ERR_NO_PACKET_RING;
+    cuframes_pkt_header_t *hdr = sub->pkt_ring.hdr;
+    if (codec_id_out)       *codec_id_out = hdr->codec_id;
+    /* Если extradata ещё не выставлен publisher'ом — size=0, pointer ok но empty. */
+    if (extradata_out)      *extradata_out = hdr->codec_extradata;
+    if (extradata_size_out) *extradata_size_out = hdr->codec_extradata_size;
+    if (hdr->codec_extradata_size == 0) return CUFRAMES_ERR_NO_CODEC_PARAMS;
+    return CUFRAMES_OK;
+}
@@ -23,12 +23,29 @@

 #define CUFRAMES_MAGIC          0xCC7C1DCCu
 #define CUFRAMES_PROTOCOL_V1    1u
+#define CUFRAMES_PROTOCOL_V2    2u                  /* v0.2 — packet ring support */
+#define CUFRAMES_PROTOCOL_V3    3u                  /* v0.3 — per-slot CUDA events (no TOCTOU race) */
 #define CUFRAMES_MAX_SUBSCRIBERS 32
 #define CUFRAMES_MAX_RING       16
 #define CUFRAMES_MAX_KEY_LEN    63
 #define CUFRAMES_MAX_NAME_LEN   31
 #define CUFRAMES_RUNTIME_DIR    "/run/cuframes"
 #define CUFRAMES_SHM_PREFIX     "/cuframes-"
+#define CUFRAMES_PKT_SHM_SUFFIX "-packets"          /* /cuframes-<key>-packets */
+
+/* Packet ring constants (см. docs/protocol.md §10) */
+#define CUFRAMES_PKT_MAGIC              0xCC7C1DCDu /* frames magic + 1 */
+#define CUFRAMES_PKT_EXTRADATA_MAX      4096u
+#define CUFRAMES_PKT_DEFAULT_SLOTS      64u
+#define CUFRAMES_PKT_DEFAULT_DATA_SIZE  (8u * 1024u * 1024u)   /* 8 MB */
+#define CUFRAMES_PKT_DEFAULT_MAX_SIZE   (2u * 1024u * 1024u)   /* 2 MB */
+#define CUFRAMES_PKT_MAX_SLOTS          1024u
+
+/* Packet flags (см. docs/protocol.md §10.6) */
+#define CUFRAMES_PKT_FLAG_KEY           0x01u
+#define CUFRAMES_PKT_FLAG_CORRUPT       0x02u
+#define CUFRAMES_PKT_FLAG_DISCONTINUITY 0x04u
+#define CUFRAMES_PKT_FLAG_LAST_IN_AU    0x08u

 /* ─── Shared memory layout (см. docs/protocol.md §2) ──────────────────── */

@@ -91,6 +108,11 @@ typedef struct __attribute__((packed)) cuframes_shm_header {
    /* offset 0x100 — variable-length tail */
    cuframes_shm_slot_t slots[CUFRAMES_MAX_RING];                /* 192 × 16 = 3072 */
    cuframes_shm_subscriber_t subscribers[CUFRAMES_MAX_SUBSCRIBERS]; /* 128 × 32 = 4096 */
+    /* v0.3 — per-slot CUDA event handles. Producer records event per publish;
+     * consumer waits event[slot_idx] specifically (не global ipc_event_handle
+     * который signals только для последнего published frame). Закрывает TOCTOU
+     * race в slot read. 64 × 16 = 1024 bytes. */
+    cudaIpcEventHandle_t slot_event_handles[CUFRAMES_MAX_RING];
 } cuframes_shm_header_t;

 /* Layout sanity checks (docs/protocol.md §2 table) */
@@ -103,6 +125,73 @@ _Static_assert(offsetof(cuframes_shm_header_t, ipc_event_handle) == 0x0080, "eve
 _Static_assert(offsetof(cuframes_shm_header_t, global_seq) == 0x00C0, "global_seq offset");
 _Static_assert(offsetof(cuframes_shm_header_t, slots) == 0x0100, "slots offset");

+/* ─── Packet ring shared memory layout (docs/protocol.md §10) ──────────── */
+
+/* Packet slot entry — packed 64 байт */
+typedef struct __attribute__((packed)) cuframes_pkt_slot {
+    _Atomic uint64_t seq;          /* UINT64_MAX = invalid */
+    int64_t          pts_ns;
+    int64_t          dts_ns;
+    uint64_t         data_offset;  /* absolute byte cursor; % data_size = ring offset */
+    uint32_t         data_size;
+    uint32_t         flags;
+    uint8_t          reserved[24];
+} cuframes_pkt_slot_t;
+_Static_assert(sizeof(cuframes_pkt_slot_t) == 64, "packet slot must be 64 bytes");
+
+/* Packet ring header (fixed 0x1040 = 4160 bytes). Followed by slots[N] + data[]. */
+typedef struct __attribute__((packed)) cuframes_pkt_header {
+    uint32_t magic;                     /* CUFRAMES_PKT_MAGIC */
+    uint32_t proto_version;             /* 2 */
+    uint32_t ring_slots;
+    uint32_t data_size;
+    uint32_t codec_id;                  /* AV_CODEC_ID_H264 / HEVC / ... */
+    uint32_t codec_extradata_size;      /* ≤ CUFRAMES_PKT_EXTRADATA_MAX */
+    uint64_t producer_pid;
+    _Atomic uint64_t global_seq;
+    _Atomic uint64_t last_keyframe_seq;
+    _Atomic uint64_t write_offset;
+    _Atomic uint64_t shutdown_flag;
+    uint8_t  codec_extradata[CUFRAMES_PKT_EXTRADATA_MAX];
+    /* offset 0x1040 — slots[ring_slots], then data[data_size] */
+} cuframes_pkt_header_t;
+
+_Static_assert(offsetof(cuframes_pkt_header_t, magic) == 0x0000, "pkt magic offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, proto_version) == 0x0004, "pkt proto offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, producer_pid) == 0x0018, "pkt pid offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, global_seq) == 0x0020, "pkt global_seq offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, write_offset) == 0x0030, "pkt write_offset offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, codec_extradata) == 0x0040, "pkt extradata offset");
+_Static_assert(sizeof(cuframes_pkt_header_t) == 0x1040, "pkt header must be 0x1040 bytes");
+
+/* Computed SHM layout helper:
+ *   total = sizeof(cuframes_pkt_header_t) + slots*sizeof(slot) + data_size
+ */
+static inline size_t cuframes_pkt_shm_size(uint32_t slots, uint32_t data_size) {
+    return sizeof(cuframes_pkt_header_t)
+         + (size_t)slots * sizeof(cuframes_pkt_slot_t)
+         + (size_t)data_size;
+}
+
+/* Pointers into mmap'ed pkt SHM (computed from header base) */
+static inline cuframes_pkt_slot_t * cuframes_pkt_slots(cuframes_pkt_header_t *hdr) {
+    return (cuframes_pkt_slot_t *)((uint8_t *)hdr + sizeof(cuframes_pkt_header_t));
+}
+static inline uint8_t * cuframes_pkt_data(cuframes_pkt_header_t *hdr) {
+    return (uint8_t *)hdr + sizeof(cuframes_pkt_header_t)
+         + (size_t)hdr->ring_slots * sizeof(cuframes_pkt_slot_t);
+}
+
+/* Opaque ring handle — содержит state и mapping для publisher или subscriber. */
+typedef struct cuframes_pkt_ring {
+    int    shm_fd;
+    void  *shm_base;
+    size_t shm_size;
+    cuframes_pkt_header_t *hdr;
+    char   shm_name[128];   /* /cuframes-<key>-packets */
+    int    is_publisher;
+} cuframes_pkt_ring_t;
+
 /* ─── Socket protocol messages (docs/protocol.md §3) ───────────────────── */

 #define CUFRAMES_MSG_HELLO_REQ      0x01
@@ -164,6 +253,8 @@ typedef struct __attribute__((packed)) cuframes_msg_subscribe_resp {
 int cuframes_internal_socket_path(const char *key, char *out, size_t out_size);
 /* Build /cuframes-<key> (for shm_open) */
 int cuframes_internal_shm_name(const char *key, char *out, size_t out_size);
+/* Build /cuframes-<key>-packets (for shm_open) */
+int cuframes_internal_pkt_shm_name(const char *key, char *out, size_t out_size);
 /* Validate key per protocol.md (alphanum/_/-, 1..63 chars) */
 int cuframes_internal_validate_key(const char *key);
 /* Calculate frame size + pitch для format/W/H */
@@ -181,4 +272,48 @@ int cuframes_internal_recv_msg(int sock_fd, uint32_t *msg_type_out,
                                void *payload, uint32_t *payload_len_inout,
                                int32_t timeout_ms);

+/* ─── Packet ring helpers (libcuframes/src/packet_ring.c) ─────────────── */
+
+/* Publisher: create SHM + initialize header + slots. Stale recovery как у frames. */
+int cuframes_internal_pkt_ring_create(const char *key,
+                                       uint32_t slots,
+                                       uint32_t data_size,
+                                       uint32_t codec_id,
+                                       cuframes_pkt_ring_t *ring_out);
+
+/* Publisher: set codec extradata (SPS/PPS). Must be called before first publish.
+ * Если size > CUFRAMES_PKT_EXTRADATA_MAX → ERR_INVALID_ARG. */
+int cuframes_internal_pkt_ring_set_extradata(cuframes_pkt_ring_t *ring,
+                                              const void *extradata,
+                                              size_t size);
+
+/* Publisher: publish single encoded packet. Slow consumer = overwrite oldest.
+ * Returns CUFRAMES_ERR_PACKET_OVERSIZED если size > data_size. */
+int cuframes_internal_pkt_ring_publish(cuframes_pkt_ring_t *ring,
+                                        const void *data, size_t size,
+                                        int64_t pts_ns, int64_t dts_ns,
+                                        uint32_t flags);
+
+/* Subscriber: open existing SHM by shm name (from HELLO_RESP packet_shm_path). */
+int cuframes_internal_pkt_ring_open(const char *shm_name,
+                                     cuframes_pkt_ring_t *ring_out);
+
+/* Subscriber: read next packet.
+ *   *seq_inout — currently held seq (we read seq_inout+1); updated on success.
+ *   out_buf must have ≥ max_packet_size bytes; out_size receives actual size.
+ * Returns:
+ *   CUFRAMES_OK on success
+ *   CUFRAMES_ERR_PACKET_OVERRUN если publisher уехал — caller resync on keyframe
+ *   CUFRAMES_ERR_TIMEOUT если нет нового packet
+ *   CUFRAMES_ERR_DISCONNECTED если publisher shutdown */
+int cuframes_internal_pkt_ring_read(cuframes_pkt_ring_t *ring,
+                                     uint64_t *seq_inout,
+                                     void *out_buf, size_t out_buf_max,
+                                     size_t *out_size,
+                                     int64_t *out_pts, int64_t *out_dts,
+                                     uint32_t *out_flags);
+
+/* Publisher OR Subscriber: cleanup mmap + close FD. Publisher additionally shm_unlink. */
+void cuframes_internal_pkt_ring_destroy(cuframes_pkt_ring_t *ring);
+
 #endif /* CUFRAMES_INTERNAL_H */
@@ -0,0 +1,380 @@
+/* libcuframes/src/packet_ring.c
+ *
+ * Variable-length encoded packet ring buffer (docs/protocol.md §10).
+ *
+ * Использует POSIX shared memory (`/cuframes-<key>-packets`), packed
+ * structures с _Atomic полями, seqlock-style read для защиты от overrun
+ * mid-read.
+ *
+ * Этот модуль внутренний — exposed API будет в Step 3 (cuframes.h
+ * extension). Сейчас functions имеют prefix `cuframes_internal_pkt_ring_*`
+ * и используются из producer.c / consumer.c.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "internal.h"
+
+/* ─── Internal helpers ────────────────────────────────────────────────── */
+
+static void wraparound_memcpy(uint8_t *dst, const uint8_t *src, size_t n,
+                              size_t buf_size, size_t offset) {
+    /* Запись n байт начиная с offset в buf размера buf_size, wraparound. */
+    size_t off = offset % buf_size;
+    size_t first = n;
+    if (first > buf_size - off) first = buf_size - off;
+    memcpy(dst + off, src, first);
+    if (first < n) {
+        memcpy(dst, src + first, n - first);
+    }
+}
+
+static void wraparound_memcpy_from(uint8_t *out, const uint8_t *buf,
+                                   size_t buf_size, size_t offset, size_t n) {
+    /* Чтение n байт из buf с wraparound от offset. */
+    size_t off = offset % buf_size;
+    size_t first = n;
+    if (first > buf_size - off) first = buf_size - off;
+    memcpy(out, buf + off, first);
+    if (first < n) {
+        memcpy(out + first, buf, n - first);
+    }
+}
+
+/* ─── Publisher API ───────────────────────────────────────────────────── */
+
+int cuframes_internal_pkt_ring_create(const char *key,
+                                       uint32_t slots,
+                                       uint32_t data_size,
+                                       uint32_t codec_id,
+                                       cuframes_pkt_ring_t *ring_out) {
+    if (!ring_out) return CUFRAMES_ERR_INVALID_ARG;
+    if (slots == 0 || slots > CUFRAMES_PKT_MAX_SLOTS) return CUFRAMES_ERR_INVALID_ARG;
+    if (data_size == 0) return CUFRAMES_ERR_INVALID_ARG;
+
+    memset(ring_out, 0, sizeof(*ring_out));
+    ring_out->shm_fd = -1;
+    ring_out->is_publisher = 1;
+
+    int r = cuframes_internal_pkt_shm_name(key, ring_out->shm_name,
+                                            sizeof(ring_out->shm_name));
+    if (r != CUFRAMES_OK) return r;
+
+    /* Stale recovery (как в frames SHM) */
+    int fd = shm_open(ring_out->shm_name, O_CREAT | O_EXCL | O_RDWR, 0644);
+    if (fd < 0) {
+        if (errno == EEXIST) {
+            int existing = shm_open(ring_out->shm_name, O_RDWR, 0);
+            if (existing >= 0) {
+                cuframes_pkt_header_t tmp;
+                ssize_t rb = read(existing, &tmp, sizeof(tmp));
+                close(existing);
+                if (rb == (ssize_t)sizeof(tmp) && tmp.magic == CUFRAMES_PKT_MAGIC) {
+                    if (cuframes_internal_pid_alive((pid_t)tmp.producer_pid)) {
+                        CUFRAMES_LOG_ERROR("packet ring %s: publisher pid %lu still alive",
+                                           ring_out->shm_name,
+                                           (unsigned long)tmp.producer_pid);
+                        return CUFRAMES_ERR_ALREADY_EXISTS;
+                    }
+                }
+            }
+            CUFRAMES_LOG_INFO("stale packet shm %s — unlinking", ring_out->shm_name);
+            shm_unlink(ring_out->shm_name);
+            fd = shm_open(ring_out->shm_name, O_CREAT | O_EXCL | O_RDWR, 0644);
+            if (fd < 0) {
+                CUFRAMES_LOG_ERROR("packet shm_open after unlink: %s", strerror(errno));
+                return CUFRAMES_ERR_IO;
+            }
+        } else {
+            CUFRAMES_LOG_ERROR("packet shm_open: %s", strerror(errno));
+            return CUFRAMES_ERR_IO;
+        }
+    }
+
+    size_t total_size = cuframes_pkt_shm_size(slots, data_size);
+    if (ftruncate(fd, (off_t)total_size) < 0) {
+        CUFRAMES_LOG_ERROR("packet ftruncate(%zu): %s", total_size, strerror(errno));
+        close(fd);
+        shm_unlink(ring_out->shm_name);
+        return CUFRAMES_ERR_IO;
+    }
+
+    void *base = mmap(NULL, total_size, PROT_READ | PROT_WRITE,
+                      MAP_SHARED, fd, 0);
+    if (base == MAP_FAILED) {
+        CUFRAMES_LOG_ERROR("packet mmap: %s", strerror(errno));
+        close(fd);
+        shm_unlink(ring_out->shm_name);
+        return CUFRAMES_ERR_IO;
+    }
+
+    ring_out->shm_fd = fd;
+    ring_out->shm_base = base;
+    ring_out->shm_size = total_size;
+    ring_out->hdr = (cuframes_pkt_header_t *)base;
+
+    /* Initialize header — нули + magic/version/sizes */
+    memset(ring_out->hdr, 0, sizeof(*ring_out->hdr));
+    ring_out->hdr->magic = CUFRAMES_PKT_MAGIC;
+    ring_out->hdr->proto_version = CUFRAMES_PROTOCOL_V2;
+    ring_out->hdr->ring_slots = slots;
+    ring_out->hdr->data_size = data_size;
+    ring_out->hdr->codec_id = codec_id;
+    ring_out->hdr->codec_extradata_size = 0;
+    ring_out->hdr->producer_pid = (uint64_t)getpid();
+    atomic_store_explicit(&ring_out->hdr->global_seq, UINT64_MAX,
+                          memory_order_release);
+    atomic_store_explicit(&ring_out->hdr->last_keyframe_seq, UINT64_MAX,
+                          memory_order_release);
+    atomic_store_explicit(&ring_out->hdr->write_offset, 0,
+                          memory_order_release);
+    atomic_store_explicit(&ring_out->hdr->shutdown_flag, 0,
+                          memory_order_release);
+
+    /* Initialize slots — invalid seq markers */
+    cuframes_pkt_slot_t *slots_arr = cuframes_pkt_slots(ring_out->hdr);
+    for (uint32_t i = 0; i < slots; ++i) {
+        atomic_store_explicit(&slots_arr[i].seq, UINT64_MAX,
+                              memory_order_release);
+    }
+
+    /* Data section уже zeroed через ftruncate (POSIX guarantees) */
+
+    CUFRAMES_LOG_INFO("packet ring %s: slots=%u data_size=%u codec_id=%u (total=%zu bytes)",
+                      ring_out->shm_name, slots, data_size, codec_id, total_size);
+    return CUFRAMES_OK;
+}
+
+int cuframes_internal_pkt_ring_set_extradata(cuframes_pkt_ring_t *ring,
+                                              const void *extradata,
+                                              size_t size) {
+    if (!ring || !ring->hdr) return CUFRAMES_ERR_INVALID_ARG;
+    if (!ring->is_publisher) return CUFRAMES_ERR_INVALID_ARG;
+    if (size > CUFRAMES_PKT_EXTRADATA_MAX) return CUFRAMES_ERR_INVALID_ARG;
+    if (size > 0 && !extradata) return CUFRAMES_ERR_INVALID_ARG;
+
+    /* Записываем сначала bytes, потом size (release-style — subscriber видит size>0 только когда extradata готов). */
+    if (size > 0) {
+        memcpy(ring->hdr->codec_extradata, extradata, size);
+        /* Memory barrier — extradata stores complete до size update. */
+        __atomic_thread_fence(__ATOMIC_RELEASE);
+    }
+    ring->hdr->codec_extradata_size = (uint32_t)size;
+    return CUFRAMES_OK;
+}
+
+int cuframes_internal_pkt_ring_publish(cuframes_pkt_ring_t *ring,
+                                        const void *data, size_t size,
+                                        int64_t pts_ns, int64_t dts_ns,
+                                        uint32_t flags) {
+    if (!ring || !ring->hdr) return CUFRAMES_ERR_INVALID_ARG;
+    if (!ring->is_publisher) return CUFRAMES_ERR_INVALID_ARG;
+    if (size == 0 || !data) return CUFRAMES_ERR_INVALID_ARG;
+    if (size > ring->hdr->data_size) return CUFRAMES_ERR_PACKET_OVERSIZED;
+
+    cuframes_pkt_header_t *hdr = ring->hdr;
+
+    /* Allocate next seq + cursor offset. Single-publisher — без CAS. */
+    uint64_t prev_seq = atomic_load_explicit(&hdr->global_seq,
+                                             memory_order_relaxed);
+    uint64_t new_seq = (prev_seq == UINT64_MAX) ? 0 : prev_seq + 1;
+
+    uint64_t write_off = atomic_load_explicit(&hdr->write_offset,
+                                              memory_order_relaxed);
+
+    /* Записать payload в data ring (wraparound aware) */
+    wraparound_memcpy(cuframes_pkt_data(hdr), data, size,
+                      hdr->data_size, write_off);
+
+    /* Записать slot metadata. Slot index = seq % ring_slots. */
+    uint32_t slot_idx = (uint32_t)(new_seq % hdr->ring_slots);
+    cuframes_pkt_slot_t *slot = &cuframes_pkt_slots(hdr)[slot_idx];
+
+    slot->pts_ns = pts_ns;
+    slot->dts_ns = dts_ns;
+    slot->data_offset = write_off;
+    slot->data_size = (uint32_t)size;
+    slot->flags = flags;
+
+    /* RELEASE order — payload bytes + slot metadata готовы перед publish seq. */
+    atomic_store_explicit(&slot->seq, new_seq, memory_order_release);
+
+    /* Update global cursor + global_seq. */
+    atomic_store_explicit(&hdr->write_offset, write_off + size,
+                          memory_order_release);
+    atomic_store_explicit(&hdr->global_seq, new_seq,
+                          memory_order_release);
+
+    /* Keyframe — update last_keyframe_seq для late subscribers. */
+    if (flags & CUFRAMES_PKT_FLAG_KEY) {
+        atomic_store_explicit(&hdr->last_keyframe_seq, new_seq,
+                              memory_order_release);
+    }
+
+    return CUFRAMES_OK;
+}
+
+/* ─── Subscriber API ──────────────────────────────────────────────────── */
+
+int cuframes_internal_pkt_ring_open(const char *shm_name,
+                                     cuframes_pkt_ring_t *ring_out) {
+    if (!shm_name || !ring_out) return CUFRAMES_ERR_INVALID_ARG;
+
+    memset(ring_out, 0, sizeof(*ring_out));
+    ring_out->shm_fd = -1;
+    ring_out->is_publisher = 0;
+    strncpy(ring_out->shm_name, shm_name, sizeof(ring_out->shm_name) - 1);
+
+    int fd = shm_open(shm_name, O_RDONLY, 0);
+    if (fd < 0) {
+        if (errno == ENOENT) return CUFRAMES_ERR_NOT_FOUND;
+        CUFRAMES_LOG_ERROR("packet shm_open(%s) ro: %s", shm_name, strerror(errno));
+        return CUFRAMES_ERR_IO;
+    }
+
+    /* Прочитать header чтобы узнать total size */
+    cuframes_pkt_header_t header_peek;
+    ssize_t rb = read(fd, &header_peek, sizeof(header_peek));
+    if (rb != (ssize_t)sizeof(header_peek)) {
+        close(fd);
+        return CUFRAMES_ERR_IO;
+    }
+    if (header_peek.magic != CUFRAMES_PKT_MAGIC) {
+        CUFRAMES_LOG_ERROR("packet shm %s: bad magic 0x%08x", shm_name, header_peek.magic);
+        close(fd);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+    if (header_peek.proto_version != CUFRAMES_PROTOCOL_V2) {
+        CUFRAMES_LOG_ERROR("packet shm %s: proto_version=%u (expected %u)",
+                           shm_name, header_peek.proto_version, CUFRAMES_PROTOCOL_V2);
+        close(fd);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+
+    size_t total = cuframes_pkt_shm_size(header_peek.ring_slots,
+                                          header_peek.data_size);
+
+    /* mmap полностью read-only */
+    void *base = mmap(NULL, total, PROT_READ, MAP_SHARED, fd, 0);
+    if (base == MAP_FAILED) {
+        CUFRAMES_LOG_ERROR("packet mmap ro: %s", strerror(errno));
+        close(fd);
+        return CUFRAMES_ERR_IO;
+    }
+
+    ring_out->shm_fd = fd;
+    ring_out->shm_base = base;
+    ring_out->shm_size = total;
+    ring_out->hdr = (cuframes_pkt_header_t *)base;
+
+    CUFRAMES_LOG_INFO("packet ring %s opened: slots=%u data_size=%u",
+                      shm_name, header_peek.ring_slots, header_peek.data_size);
+    return CUFRAMES_OK;
+}
+
+int cuframes_internal_pkt_ring_read(cuframes_pkt_ring_t *ring,
+                                     uint64_t *seq_inout,
+                                     void *out_buf, size_t out_buf_max,
+                                     size_t *out_size,
+                                     int64_t *out_pts, int64_t *out_dts,
+                                     uint32_t *out_flags) {
+    if (!ring || !ring->hdr || !seq_inout || !out_buf || !out_size
+        || !out_pts || !out_dts || !out_flags) {
+        return CUFRAMES_ERR_INVALID_ARG;
+    }
+
+    cuframes_pkt_header_t *hdr = ring->hdr;
+
+    /* Publisher shutdown? */
+    if (atomic_load_explicit(&hdr->shutdown_flag, memory_order_acquire) != 0) {
+        return CUFRAMES_ERR_DISCONNECTED;
+    }
+
+    /* Текущий published seq */
+    uint64_t cur = atomic_load_explicit(&hdr->global_seq, memory_order_acquire);
+    if (cur == UINT64_MAX) return CUFRAMES_ERR_TIMEOUT; /* нет published */
+    if (*seq_inout != UINT64_MAX && cur <= *seq_inout) {
+        return CUFRAMES_ERR_TIMEOUT;
+    }
+
+    /* Calculate the next seq we want (handle первый read с UINT64_MAX → start с 0) */
+    uint64_t want_seq = (*seq_inout == UINT64_MAX) ? 0 : (*seq_inout + 1);
+
+    /* Если want_seq < cur и slot уже перезаписан — попадаем в OVERRUN */
+    if (cur - want_seq >= hdr->ring_slots) {
+        /* Скорее всего slot уже rewritten. Подсказка caller'у — resync. */
+        return CUFRAMES_ERR_PACKET_OVERRUN;
+    }
+
+    uint32_t slot_idx = (uint32_t)(want_seq % hdr->ring_slots);
+    cuframes_pkt_slot_t *slot = &cuframes_pkt_slots(hdr)[slot_idx];
+
+    /* Seqlock-style read: load seq, prove not overwritten после copy. */
+    uint64_t s1 = atomic_load_explicit(&slot->seq, memory_order_acquire);
+    if (s1 != want_seq) {
+        /* Slot уже занят следующим packet'ом — overrun. */
+        return CUFRAMES_ERR_PACKET_OVERRUN;
+    }
+
+    /* Снять metadata (non-atomic — read OK поскольку post-check защищает) */
+    uint64_t data_off = slot->data_offset;
+    uint32_t data_sz  = slot->data_size;
+    int64_t  pts      = slot->pts_ns;
+    int64_t  dts      = slot->dts_ns;
+    uint32_t flags    = slot->flags;
+
+    if (data_sz > out_buf_max) {
+        return CUFRAMES_ERR_INVALID_ARG; /* caller's buf too small */
+    }
+
+    /* Copy payload */
+    wraparound_memcpy_from((uint8_t *)out_buf,
+                            cuframes_pkt_data(hdr),
+                            hdr->data_size, data_off, data_sz);
+
+    /* Post-check: slot->seq не изменился во время copy. */
+    uint64_t s2 = atomic_load_explicit(&slot->seq, memory_order_acquire);
+    if (s2 != want_seq) {
+        return CUFRAMES_ERR_PACKET_OVERRUN;
+    }
+
+    *out_size  = data_sz;
+    *out_pts   = pts;
+    *out_dts   = dts;
+    *out_flags = flags;
+    *seq_inout = want_seq;
+    return CUFRAMES_OK;
+}
+
+/* ─── Cleanup ─────────────────────────────────────────────────────────── */
+
+void cuframes_internal_pkt_ring_destroy(cuframes_pkt_ring_t *ring) {
+    if (!ring) return;
+
+    if (ring->is_publisher && ring->hdr) {
+        /* Сигнализируем consumer'ам shutdown */
+        atomic_store_explicit(&ring->hdr->shutdown_flag, 1,
+                              memory_order_release);
+    }
+
+    if (ring->shm_base && ring->shm_size > 0) {
+        munmap(ring->shm_base, ring->shm_size);
+    }
+    if (ring->shm_fd >= 0) {
+        close(ring->shm_fd);
+    }
+    if (ring->is_publisher && ring->shm_name[0] != '\0') {
+        shm_unlink(ring->shm_name);
+    }
+
+    memset(ring, 0, sizeof(*ring));
+    ring->shm_fd = -1;
+}
@@ -21,7 +21,8 @@ struct cuframes_publisher {
    char                shm_name[80];

    /* CUDA */
-    cudaEvent_t         event;
+    cudaEvent_t         event;                              /* legacy single event (v0.2 compat) */
+    cudaEvent_t         slot_events[CUFRAMES_MAX_RING];     /* v0.3 — per-slot events */
    cudaIpcMemHandle_t  ipc_mem[CUFRAMES_MAX_RING];
    void               *cuda_ptrs[CUFRAMES_MAX_RING];  /* mapped pointers */
    size_t              frame_size_bytes;
@@ -41,6 +42,11 @@ struct cuframes_publisher {
    int                 accept_thread_alive;
    int                 stop_flag;
    pthread_mutex_t     state_mu;  /* protects subscriber connections */
+
+    /* v0.2 — encoded packet ring (optional). is_pkt_ring=1 → активирован. */
+    int                 has_pkt_ring;
+    uint32_t            max_packet_size;
+    cuframes_pkt_ring_t pkt_ring;
 };

 /* Forward decls */
@@ -109,13 +115,28 @@ static int register_external_pool(struct cuframes_publisher *pub,
 }

 static int create_event_handle(struct cuframes_publisher *pub) {
+    /* Legacy single event — keep для v0.2 consumer compat fallback */
    cudaError_t cerr = cudaEventCreateWithFlags(&pub->event,
        cudaEventDisableTiming | cudaEventInterprocess);
    if (cerr != cudaSuccess) {
-        CUFRAMES_LOG_ERROR("cudaEventCreateWithFlags: %s",
+        CUFRAMES_LOG_ERROR("cudaEventCreateWithFlags (legacy): %s",
                            cudaGetErrorString(cerr));
        return CUFRAMES_ERR_CUDA;
    }
+    /* v0.3 — per-slot events. Каждый publish записывает event на свой slot;
+     * consumer waits event[slot_idx] specifically — закрывает TOCTOU race
+     * (один global event может signal'ить для другого frame). */
+    for (int32_t i = 0; i < pub->ring_size_actual; i++) {
+        cerr = cudaEventCreateWithFlags(&pub->slot_events[i],
+            cudaEventDisableTiming | cudaEventInterprocess);
+        if (cerr != cudaSuccess) {
+            CUFRAMES_LOG_ERROR("cudaEventCreateWithFlags (slot %d): %s",
+                                i, cudaGetErrorString(cerr));
+            for (int32_t j = 0; j < i; j++) cudaEventDestroy(pub->slot_events[j]);
+            cudaEventDestroy(pub->event);
+            return CUFRAMES_ERR_CUDA;
+        }
+    }
    return CUFRAMES_OK;
 }

@@ -167,7 +188,7 @@ static int setup_shm(struct cuframes_publisher *pub) {
    memset(pub->hdr, 0, sizeof(cuframes_shm_header_t));

    pub->hdr->magic = CUFRAMES_MAGIC;
-    pub->hdr->proto_version = CUFRAMES_PROTOCOL_V1;
+    pub->hdr->proto_version = CUFRAMES_PROTOCOL_V3;
    pub->hdr->lib_version_major = CUFRAMES_VERSION_MAJOR;
    pub->hdr->lib_version_minor = CUFRAMES_VERSION_MINOR;
    pub->hdr->lib_version_patch = CUFRAMES_VERSION_PATCH;
@@ -187,13 +208,22 @@ static int setup_shm(struct cuframes_publisher *pub) {
    pub->hdr->meta.pitch_uv = puv;
    pub->hdr->meta.frame_size_bytes = pub->frame_size_bytes;

-    /* Export event handle */
+    /* Export event handle (legacy single) */
    cudaError_t cerr = cudaIpcGetEventHandle(&pub->hdr->ipc_event_handle, pub->event);
    if (cerr != cudaSuccess) {
        CUFRAMES_LOG_ERROR("cudaIpcGetEventHandle: %s", cudaGetErrorString(cerr));
        return CUFRAMES_ERR_CUDA;
    }
-
+    /* v0.3 — export per-slot event handles */
+    for (int32_t i = 0; i < pub->ring_size_actual; i++) {
+        cerr = cudaIpcGetEventHandle(&pub->hdr->slot_event_handles[i],
+                                      pub->slot_events[i]);
+        if (cerr != cudaSuccess) {
+            CUFRAMES_LOG_ERROR("cudaIpcGetEventHandle (slot %d): %s",
+                                i, cudaGetErrorString(cerr));
+            return CUFRAMES_ERR_CUDA;
+        }
+    }
    /* Fill slot descriptors */
    for (int i = 0; i < pub->ring_size_actual; ++i) {
        pub->hdr->slots[i].mem_handle = pub->ipc_mem[i];
@@ -402,10 +432,19 @@ int cuframes_publisher_acquire(cuframes_publisher_t *pub, void **cuda_ptr_out) {

 static int do_publish(cuframes_publisher_t *pub, int32_t slot,
                       void *stream, int64_t pts_ns) {
-    /* Record event on producer's stream */
-    cudaError_t cerr = cudaEventRecord(pub->event, (cudaStream_t)stream);
+    /* v0.3 — record per-slot event для precise consumer sync. Closes TOCTOU
+     * race где legacy `pub->event` signals "latest publish", not slot-specific. */
+    cudaError_t cerr = cudaEventRecord(pub->slot_events[slot], (cudaStream_t)stream);
    if (cerr != cudaSuccess) {
-        CUFRAMES_LOG_ERROR("cudaEventRecord: %s", cudaGetErrorString(cerr));
+        CUFRAMES_LOG_ERROR("cudaEventRecord (slot %d): %s",
+                            slot, cudaGetErrorString(cerr));
+        return CUFRAMES_ERR_CUDA;
+    }
+    /* Legacy event — keep recording для v0.2 consumer compat fallback */
+    cerr = cudaEventRecord(pub->event, (cudaStream_t)stream);
+    if (cerr != cudaSuccess) {
+        CUFRAMES_LOG_ERROR("cudaEventRecord (legacy): %s",
+                            cudaGetErrorString(cerr));
        return CUFRAMES_ERR_CUDA;
    }

@@ -504,6 +543,15 @@ int cuframes_publisher_destroy(cuframes_publisher_t *pub) {
        }
    }
    if (pub->event) cudaEventDestroy(pub->event);
+    for (int32_t i = 0; i < pub->ring_size_actual; i++) {
+        if (pub->slot_events[i]) cudaEventDestroy(pub->slot_events[i]);
+    }
+
+    /* Packet ring cleanup (если активирован) */
+    if (pub->has_pkt_ring) {
+        cuframes_internal_pkt_ring_destroy(&pub->pkt_ring);
+        pub->has_pkt_ring = 0;
+    }

    /* Unlink resources */
    if (pub->hdr) {
@@ -523,8 +571,95 @@ int cuframes_publisher_destroy(cuframes_publisher_t *pub) {
    return CUFRAMES_OK;
 }

+/* ─────────────────────────────────────────────────────────────────────── */
+/* v0.2 — encoded packet ring API (см. docs/protocol.md §10)                */
+/* ─────────────────────────────────────────────────────────────────────── */
+
+int cuframes_publisher_enable_packets(cuframes_publisher_t *pub,
+                                       const cuframes_packet_ring_options_t *opts) {
+    if (!pub) return CUFRAMES_ERR_INVALID_ARG;
+    if (pub->has_pkt_ring) return CUFRAMES_ERR_ALREADY_EXISTS;
+
+    uint32_t slots     = opts && opts->ring_slots ? opts->ring_slots
+                                                  : CUFRAMES_PKT_DEFAULT_SLOTS;
+    uint32_t data_size = opts && opts->data_size  ? opts->data_size
+                                                  : CUFRAMES_PKT_DEFAULT_DATA_SIZE;
+    uint32_t max_pkt   = opts && opts->max_packet_size ? opts->max_packet_size
+                                                       : CUFRAMES_PKT_DEFAULT_MAX_SIZE;
+    uint32_t codec_id  = opts ? opts->codec_id : 0;
+
+    if (max_pkt > data_size) {
+        CUFRAMES_LOG_ERROR("max_packet_size (%u) > data_size (%u)", max_pkt, data_size);
+        return CUFRAMES_ERR_INVALID_ARG;
+    }
+
+    int r = cuframes_internal_pkt_ring_create(pub->key, slots, data_size,
+                                               codec_id, &pub->pkt_ring);
+    if (r != CUFRAMES_OK) return r;
+
+    pub->has_pkt_ring = 1;
+    pub->max_packet_size = max_pkt;
+
+    /* Bump proto_version в frames header чтобы v2-subscribers видели поддержку. */
+    if (pub->hdr) {
+        pub->hdr->proto_version = CUFRAMES_PROTOCOL_V2;
+    }
+    return CUFRAMES_OK;
+}
+
+int cuframes_publisher_set_codec_extradata(cuframes_publisher_t *pub,
+                                            const void *extradata, size_t size) {
+    if (!pub) return CUFRAMES_ERR_INVALID_ARG;
+    if (!pub->has_pkt_ring) return CUFRAMES_ERR_NO_PACKET_RING;
+    return cuframes_internal_pkt_ring_set_extradata(&pub->pkt_ring,
+                                                     extradata, size);
+}
+
+int cuframes_publisher_publish_packet(cuframes_publisher_t *pub,
+                                       const void *data, size_t size,
+                                       int64_t pts_ns, int64_t dts_ns,
+                                       uint32_t flags) {
+    if (!pub) return CUFRAMES_ERR_INVALID_ARG;
+    if (!pub->has_pkt_ring) return CUFRAMES_ERR_NO_PACKET_RING;
+    if (size > pub->max_packet_size) return CUFRAMES_ERR_PACKET_OVERSIZED;
+    return cuframes_internal_pkt_ring_publish(&pub->pkt_ring, data, size,
+                                               pts_ns, dts_ns, flags);
+}
+
 /* ─── Accept thread + handshake ──────────────────────────────────────── */

+/* Per-subscriber lifecycle monitor — detects socket close (subscriber container
+ * exited / crashed) и освобождает bit + subscribers[] slot. Без этого каждый
+ * pipeline recreate leaks bit → bitmap overflows after 32 connections. */
+struct sub_monitor_args {
+    struct cuframes_publisher *pub;
+    int fd;
+    uint32_t bit;
+};
+
+static void *subscriber_monitor_thread(void *arg) {
+    struct sub_monitor_args *m = (struct sub_monitor_args *)arg;
+    char buf[64];
+    /* Blocking read — return 0 (EOF) когда other side close socket, или
+     * <0 on error. Любой control message (PING — TODO в будущем) just consumed. */
+    while (1) {
+        ssize_t n = recv(m->fd, buf, sizeof(buf), 0);
+        if (n <= 0) {
+            /* Subscriber dead — clear bit + slot state. */
+            atomic_fetch_and_explicit(&m->pub->hdr->subscriber_bitmap,
+                ~(1ULL << m->bit), memory_order_release);
+            atomic_store_explicit(&m->pub->hdr->subscribers[m->bit].state, 0,
+                memory_order_release);
+            close(m->fd);
+            CUFRAMES_LOG_INFO("subscriber bit=%u disconnected — freed",
+                              m->bit);
+            free(m);
+            return NULL;
+        }
+        /* future: parse control msgs (PING, UNSUBSCRIBE) here */
+    }
+}
+
 static void *accept_thread_main(void *arg) {
    struct cuframes_publisher *pub = (struct cuframes_publisher *)arg;
    while (!pub->stop_flag) {
@@ -537,14 +672,12 @@ static void *accept_thread_main(void *arg) {
            CUFRAMES_LOG_WARN("accept: %s", strerror(errno));
            continue;
        }
-        /* Synchronous handshake — после ответа socket остаётся открытым для
-         * lifetime signals (SHUTDOWN, PING). Close на error. */
+        /* Handshake — на error close socket (no monitor spawned). На success
+         * monitor thread становится owner socket'a + cleanup'ит при disconnect. */
        int r = handshake_subscriber(pub, client);
        if (r != CUFRAMES_OK) {
            close(client);
        }
-        /* TODO v0.2: track client fds для broadcast SHUTDOWN. Сейчас clients
-         * сами detect socket EOF при publisher_destroy через shutdown(). */
    }
    return NULL;
 }
@@ -661,7 +794,23 @@ static int handshake_subscriber(struct cuframes_publisher *pub, int client_fd) {

    CUFRAMES_LOG_INFO("subscriber '%s' connected (bit=%u)", name, bit);

-    /* TODO v0.2: spawn per-client thread для liveness/PING/UNSUBSCRIBE.
-     * Сейчас socket остаётся открытым на heap'е до publisher_destroy. */
+    /* Spawn detached monitor thread — owns client_fd, frees bit on socket
+     * close (subscriber container exit / crash). Без этого bitmap утекал
+     * каждый pipeline recreate. */
+    struct sub_monitor_args *m = malloc(sizeof(*m));
+    if (!m) {
+        /* OOM — fallback: leak fd, bit будет released только publisher_destroy */
+        return CUFRAMES_OK;
+    }
+    m->pub = pub;
+    m->fd = client_fd;
+    m->bit = bit;
+    pthread_t monitor_tid;
+    if (pthread_create(&monitor_tid, NULL, subscriber_monitor_thread, m) != 0) {
+        CUFRAMES_LOG_WARN("monitor pthread_create fail — bit %u may leak", bit);
+        free(m);
+    } else {
+        pthread_detach(monitor_tid);
+    }
    return CUFRAMES_OK;
 }
@@ -32,6 +32,10 @@ const char *cuframes_strerror(int err) {
        case CUFRAMES_ERR_FORMAT:         return "unsupported format or size mismatch";
        case CUFRAMES_ERR_WOULD_BLOCK:    return "would block";
        case CUFRAMES_ERR_TOO_MANY:       return "too many subscribers (max 32)";
+        case CUFRAMES_ERR_PACKET_OVERSIZED: return "packet exceeds max_packet_size";
+        case CUFRAMES_ERR_NO_PACKET_RING: return "publisher has no packet ring";
+        case CUFRAMES_ERR_NO_CODEC_PARAMS: return "codec extradata not set by publisher";
+        case CUFRAMES_ERR_PACKET_OVERRUN: return "packet ring overrun — resync on keyframe";
        case CUFRAMES_ERR_INTERNAL:       return "internal error (please report)";
        default:                          return "unknown error";
    }
@@ -83,6 +87,15 @@ int cuframes_internal_shm_name(const char *key, char *out, size_t out_size) {
    return CUFRAMES_OK;
 }

+int cuframes_internal_pkt_shm_name(const char *key, char *out, size_t out_size) {
+    int r = cuframes_internal_validate_key(key);
+    if (r != CUFRAMES_OK) return r;
+    int n = snprintf(out, out_size, "%s%s%s",
+                     CUFRAMES_SHM_PREFIX, key, CUFRAMES_PKT_SHM_SUFFIX);
+    if (n < 0 || (size_t)n >= out_size) return CUFRAMES_ERR_INVALID_ARG;
+    return CUFRAMES_OK;
+}
+
 int cuframes_internal_ensure_runtime_dir(void) {
    if (mkdir(CUFRAMES_RUNTIME_DIR, 0755) == 0) return CUFRAMES_OK;
    if (errno == EEXIST) return CUFRAMES_OK;
@@ -22,3 +22,11 @@ target_include_directories(test_stress PRIVATE
    ${CMAKE_SOURCE_DIR}/include)
 add_test(NAME stress_4consumer COMMAND test_stress)
 set_tests_properties(stress_4consumer PROPERTIES TIMEOUT 120)
+
+# v0.2 — packet ring tests (host-only, без CUDA в test-коде)
+add_executable(test_packet_ring test_packet_ring.c)
+target_link_libraries(test_packet_ring PRIVATE cuframes)
+target_include_directories(test_packet_ring PRIVATE
+    ${CMAKE_SOURCE_DIR}/include)
+add_test(NAME packet_ring_basic COMMAND test_packet_ring)
+set_tests_properties(packet_ring_basic PROPERTIES TIMEOUT 120)
@@ -0,0 +1,280 @@
+/* Stress test для encoded packet ring (v0.2).
+ *
+ * Сценарии:
+ *  1) Normal flow: 1 publisher × 1 subscriber × 2000 packets, varied sizes,
+ *     каждые 30 packets — KEY flag (имитация GOP). Subscriber проверяет:
+ *     - монотонные seq (без пропусков в этом тесте — fast consumer)
+ *     - data integrity через checksum (XOR fold)
+ *     - PTS/DTS monotonic, KEY flag доходит
+ *  2) Slow subscriber: publisher шлёт быстрее чем subscriber читает →
+ *     должен случиться OVERRUN, library resync'нет на keyframe.
+ *  3) Cleanup: после exit нет leaked SHM в /dev/shm.
+ *
+ * Без CUDA-зависимостей (packets host-side).
+ */
+#include <cuframes/cuframes.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#define KEY "test_pkt_ring"
+#define TOTAL_PACKETS 2000
+#define GOP_SIZE 30
+#define SMALL_PKT 4096
+#define LARGE_PKT (256 * 1024)
+
+#define CHECK(call) do { int _r = (call); if (_r != 0) { \
+    fprintf(stderr, "FAIL %s:%d (rc=%d): %s\n", __FILE__, __LINE__, _r, \
+        cuframes_strerror(_r)); exit(2); } } while (0)
+
+#define EXPECT_TRUE(cond) do { if (!(cond)) { \
+    fprintf(stderr, "EXPECT_TRUE failed at %s:%d: %s\n", \
+        __FILE__, __LINE__, #cond); exit(2); } } while (0)
+
+/* Сгенерировать payload: первые 8 байт = seq (little-endian), остальное pattern. */
+static void gen_payload(uint8_t *buf, size_t size, uint64_t seq) {
+    memcpy(buf, &seq, sizeof(seq));
+    for (size_t i = sizeof(seq); i < size; ++i) {
+        buf[i] = (uint8_t)((seq + i) & 0xFF);
+    }
+}
+
+/* Verify payload matches seq. Возвращает 0 если ok. */
+static int verify_payload(const uint8_t *buf, size_t size, uint64_t expected_seq) {
+    uint64_t seq_in_buf;
+    if (size < sizeof(seq_in_buf)) return -1;
+    memcpy(&seq_in_buf, buf, sizeof(seq_in_buf));
+    if (seq_in_buf != expected_seq) return -2;
+    for (size_t i = sizeof(seq_in_buf); i < size; ++i) {
+        if (buf[i] != (uint8_t)((expected_seq + i) & 0xFF)) return -3;
+    }
+    return 0;
+}
+
+static cuframes_publisher_t *make_publisher(void) {
+    cuframes_publisher_config_t cfg = {0};
+    cfg.key = KEY;
+    cfg.width = 320;
+    cfg.height = 240;
+    cfg.format = CUFRAMES_FORMAT_NV12;
+    cfg.ownership = CUFRAMES_OWNERSHIP_LIBRARY;
+    cfg.ring_size = 2;
+    cfg.policy = CUFRAMES_POLICY_DROP_OLDEST;
+    cfg.cuda_device = 0;
+    cuframes_publisher_t *pub = NULL;
+    CHECK(cuframes_publisher_create(&cfg, &pub));
+
+    cuframes_packet_ring_options_t pkt_opts = {0};
+    pkt_opts.codec_id = 27;  /* AV_CODEC_ID_H264 */
+    pkt_opts.ring_slots = 64;
+    pkt_opts.data_size = 8 * 1024 * 1024;
+    pkt_opts.max_packet_size = LARGE_PKT * 2;
+    CHECK(cuframes_publisher_enable_packets(pub, &pkt_opts));
+
+    /* Fake SPS/PPS — 16 байт */
+    uint8_t extradata[16];
+    for (int i = 0; i < 16; ++i) extradata[i] = (uint8_t)(0xAA + i);
+    CHECK(cuframes_publisher_set_codec_extradata(pub, extradata, sizeof(extradata)));
+    return pub;
+}
+
+/* Subscriber-процесс. read_delay_us позволяет имитировать slow consumer. */
+static int run_subscriber(int read_delay_us, int *out_received, int *out_overruns,
+                          int *out_first_key_seq) {
+    /* Wait чтобы publisher успел создать SHM */
+    usleep(100 * 1000);
+
+    cuframes_subscriber_config_t cfg = {0};
+    cfg.key = KEY;
+    cfg.mode = CUFRAMES_MODE_NEWEST_ONLY;
+    cfg.cuda_device = 0;
+    cfg.connect_timeout_ms = 5000;
+    cuframes_subscriber_t *sub = NULL;
+    CHECK(cuframes_subscriber_create(&cfg, &sub));
+
+    CHECK(cuframes_subscriber_enable_packets(sub));
+
+    /* Verify codec params */
+    uint32_t codec_id = 0;
+    const void *extradata = NULL;
+    size_t extradata_sz = 0;
+    int r = cuframes_subscriber_get_codec_params(sub, &codec_id, &extradata, &extradata_sz);
+    EXPECT_TRUE(r == CUFRAMES_OK);
+    EXPECT_TRUE(codec_id == 27);
+    EXPECT_TRUE(extradata_sz == 16);
+
+    int received = 0;
+    int overruns = 0;
+    int first_key_seq = -1;
+    int64_t last_pts = -1;
+    int data_errors = 0;
+
+    /* Run на ~30s или до того как publisher закончит. */
+    time_t start = time(NULL);
+    while (time(NULL) - start < 30) {
+        cuframes_packet_t *pkt = NULL;
+        int rc = cuframes_subscriber_next_packet(sub, &pkt, 500);
+        if (rc == CUFRAMES_ERR_TIMEOUT || rc == CUFRAMES_ERR_WOULD_BLOCK) {
+            if (received >= TOTAL_PACKETS / 2) break; /* достаточно для теста */
+            continue;
+        }
+        if (rc == CUFRAMES_ERR_DISCONNECTED) break;
+        if (rc == CUFRAMES_ERR_PACKET_OVERRUN) {
+            overruns++;
+            continue; /* library resync'нет на next call */
+        }
+        if (rc != CUFRAMES_OK) {
+            fprintf(stderr, "next_packet rc=%d (%s)\n", rc, cuframes_strerror(rc));
+            break;
+        }
+
+        const uint8_t *data = (const uint8_t *)cuframes_packet_data(pkt);
+        size_t size = cuframes_packet_size(pkt);
+        int64_t pts = cuframes_packet_pts(pkt);
+        uint32_t flags = cuframes_packet_flags(pkt);
+        uint64_t seq = cuframes_packet_seq(pkt);
+
+        if (verify_payload(data, size, seq) != 0) {
+            data_errors++;
+        }
+
+        if ((flags & CUFRAMES_PKT_FLAG_KEY) && first_key_seq < 0) {
+            first_key_seq = (int)seq;
+        }
+        if (pts <= last_pts && last_pts >= 0) {
+            fprintf(stderr, "PTS не монотонно: %ld <= %ld (seq=%lu)\n",
+                    pts, last_pts, seq);
+        }
+        last_pts = pts;
+        received++;
+
+        cuframes_subscriber_release_packet(sub, pkt);
+
+        if (read_delay_us > 0) usleep(read_delay_us);
+    }
+
+    EXPECT_TRUE(data_errors == 0);
+    cuframes_subscriber_destroy(sub);
+
+    *out_received = received;
+    *out_overruns = overruns;
+    *out_first_key_seq = first_key_seq;
+    return 0;
+}
+
+static void publisher_loop(int total_packets, int inter_packet_us) {
+    cuframes_publisher_t *pub = make_publisher();
+
+    /* Buffer pre-alloc — max size */
+    uint8_t *buf = (uint8_t *)malloc(LARGE_PKT);
+    EXPECT_TRUE(buf != NULL);
+
+    for (int i = 0; i < total_packets; ++i) {
+        int is_key = (i % GOP_SIZE == 0);
+        size_t size = is_key ? LARGE_PKT : SMALL_PKT + (i % 8) * 1024;
+        gen_payload(buf, size, (uint64_t)i);
+
+        int64_t pts_ns = (int64_t)i * 33333333LL; /* ~30 fps */
+        uint32_t flags = is_key ? CUFRAMES_PKT_FLAG_KEY : 0;
+        int rc = cuframes_publisher_publish_packet(pub, buf, size,
+                                                    pts_ns, pts_ns, flags);
+        if (rc != CUFRAMES_OK) {
+            fprintf(stderr, "publish rc=%d size=%zu\n", rc, size);
+        }
+        if (inter_packet_us > 0) usleep(inter_packet_us);
+    }
+    free(buf);
+    cuframes_publisher_destroy(pub);
+}
+
+static int check_no_leaked_shm(void) {
+    int fail = 0;
+    char path[256];
+    snprintf(path, sizeof(path), "/dev/shm/cuframes-%s", KEY);
+    if (access(path, F_OK) == 0) {
+        fprintf(stderr, "LEAKED %s\n", path);
+        fail = 1;
+    }
+    snprintf(path, sizeof(path), "/dev/shm/cuframes-%s-packets", KEY);
+    if (access(path, F_OK) == 0) {
+        fprintf(stderr, "LEAKED %s\n", path);
+        fail = 1;
+    }
+    return fail;
+}
+
+static int scenario_normal_flow(void) {
+    fprintf(stderr, "[scenario 1] normal flow — fast consumer\n");
+
+    pid_t pid = fork();
+    EXPECT_TRUE(pid >= 0);
+    if (pid == 0) {
+        /* child = subscriber */
+        int received = 0, overruns = 0, first_key = -1;
+        run_subscriber(0, &received, &overruns, &first_key);
+        fprintf(stderr, "  consumer: received=%d overruns=%d first_key_seq=%d\n",
+                received, overruns, first_key);
+        EXPECT_TRUE(received >= TOTAL_PACKETS / 2);
+        EXPECT_TRUE(overruns == 0);
+        EXPECT_TRUE(first_key >= 0);
+        exit(0);
+    }
+
+    /* parent = publisher (медленнее чем consumer) */
+    publisher_loop(TOTAL_PACKETS, 1000); /* 1ms между packets = 1000 fps */
+    int status = 0;
+    waitpid(pid, &status, 0);
+    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+    return 0;
+}
+
+static int scenario_slow_consumer(void) {
+    fprintf(stderr, "[scenario 2] slow consumer — must hit OVERRUN + resync\n");
+
+    pid_t pid = fork();
+    EXPECT_TRUE(pid >= 0);
+    if (pid == 0) {
+        /* child = очень медленный subscriber */
+        int received = 0, overruns = 0, first_key = -1;
+        run_subscriber(10 * 1000, &received, &overruns, &first_key); /* 10ms */
+        fprintf(stderr, "  consumer: received=%d overruns=%d first_key_seq=%d\n",
+                received, overruns, first_key);
+        /* Должны быть overruns поскольку publisher faster */
+        EXPECT_TRUE(overruns > 0);
+        /* И всё-таки что-то получили (resync работает) */
+        EXPECT_TRUE(received > 10);
+        exit(0);
+    }
+
+    /* publisher fast — 200 fps */
+    publisher_loop(TOTAL_PACKETS, 5 * 1000);
+    int status = 0;
+    waitpid(pid, &status, 0);
+    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+    return 0;
+}
+
+int main(void) {
+    signal(SIGPIPE, SIG_IGN);
+
+    scenario_normal_flow();
+    /* Ensure clean inter-test state */
+    usleep(200 * 1000);
+    if (check_no_leaked_shm()) exit(2);
+
+    scenario_slow_consumer();
+    usleep(200 * 1000);
+    if (check_no_leaked_shm()) exit(2);
+
+    fprintf(stderr, "OK — all scenarios passed\n");
+    return 0;
+}
@@ -60,6 +60,7 @@ struct Args {
    bool verbose = false;
    bool realtime = false;  // emulate -re у ffmpeg CLI: sleep по pts
    bool loop = false;      // loop input на eof (для file://)
+    bool enable_packet_ring = false;  // v0.2 — публиковать encoded packets
 };

 static void print_usage() {
@@ -75,6 +76,8 @@ static void print_usage() {
        "  --ring N            cuframes ring size (default 4, range 2..16)\n"
        "  --realtime          pace input по PTS (как ffmpeg -re; полезно для файла)\n"
        "  --loop              loop input на EOF (только для file://)\n"
+        "  --enable-packet-ring  v0.2: дополнительно публиковать encoded packets\n"
+        "                       (для consumer'ов с -c:v copy, Frigate record path)\n"
        "  --verbose           debug logs\n"
        "  -h, --help          this help\n";
 }
@@ -92,6 +95,7 @@ static int parse_args(int argc, char **argv, Args &a) {
        else if (s == "--ring") a.ring_size = std::stoi(next());
        else if (s == "--realtime") a.realtime = true;
        else if (s == "--loop") a.loop = true;
+        else if (s == "--enable-packet-ring") a.enable_packet_ring = true;
        else if (s == "--verbose") a.verbose = true;
        else if (s == "-h" || s == "--help") { print_usage(); std::exit(0); }
        else { std::cerr << "Unknown arg: " << s << "\n"; print_usage(); std::exit(1); }
@@ -235,6 +239,27 @@ int main(int argc, char **argv) {
              << "' ready, ring=" << a.ring_size
              << " pool_size=" << frame_size << " bytes/frame\n";

+    /* v0.2 — encoded packet ring (опционально). */
+    if (a.enable_packet_ring) {
+        cuframes_packet_ring_options_t pkt_opts{};
+        pkt_opts.codec_id = (uint32_t)vstream->codecpar->codec_id;
+        /* остальные поля = 0 → library использует defaults (64 slots, 8MiB, 2MiB max) */
+        pub.enable_packets(&pkt_opts);
+
+        if (vstream->codecpar->extradata_size > 0 && vstream->codecpar->extradata) {
+            pub.set_codec_extradata(vstream->codecpar->extradata,
+                                     (size_t)vstream->codecpar->extradata_size);
+            std::cerr << "[cuframes-src] packet ring active, codec_id="
+                      << vstream->codecpar->codec_id
+                      << " extradata=" << vstream->codecpar->extradata_size
+                      << " bytes\n";
+        } else {
+            std::cerr << "[cuframes-src] packet ring active, codec_id="
+                      << vstream->codecpar->codec_id
+                      << " (no extradata in stream — will rely on in-band SPS/PPS)\n";
+        }
+    }
+
    /* Stream для D2D copies */
    cudaStream_t stream;
    cudaStreamCreate(&stream);
@@ -279,6 +304,29 @@ int main(int argc, char **argv) {
            continue;
        }

+        /* v0.2 — публикуем encoded packet в packet ring ДО decoder. Это позволяет
+         * record-consumer'ам брать packet без второго RTSP-подключения к камере. */
+        if (a.enable_packet_ring) {
+            int64_t pkt_pts_ns = (pkt->pts != AV_NOPTS_VALUE)
+                ? av_rescale_q(pkt->pts, stream_tb, AVRational{1, 1000000000})
+                : cuframes::now_ns();
+            int64_t pkt_dts_ns = (pkt->dts != AV_NOPTS_VALUE)
+                ? av_rescale_q(pkt->dts, stream_tb, AVRational{1, 1000000000})
+                : pkt_pts_ns;
+            uint32_t pkt_flags = 0;
+            if (pkt->flags & AV_PKT_FLAG_KEY)        pkt_flags |= CUFRAMES_PKT_FLAG_KEY;
+            if (pkt->flags & AV_PKT_FLAG_CORRUPT)    pkt_flags |= CUFRAMES_PKT_FLAG_CORRUPT;
+#ifdef AV_PKT_FLAG_DISCONTINUITY
+            if (pkt->flags & AV_PKT_FLAG_DISCONTINUITY) pkt_flags |= CUFRAMES_PKT_FLAG_DISCONTINUITY;
+#endif
+            int prr = pub.publish_packet(pkt->data, (size_t)pkt->size,
+                                          pkt_pts_ns, pkt_dts_ns, pkt_flags);
+            if (prr != CUFRAMES_OK && a.verbose) {
+                std::cerr << "[cuframes-src] publish_packet rc=" << prr
+                          << " size=" << pkt->size << "\n";
+            }
+        }
+
        r = avcodec_send_packet(ctx, pkt);
        av_packet_unref(pkt);
        if (r < 0) continue;