Merge pull request 'Python bindings (pybind11) — Phase 0 v1' (#7 ) from feat/python-bindings into main

python: DLPack + health stats + CUDA stream + docs (tasks #199-#202)
#199 DLPack export: - frame.dlpack_y() / .dlpack_uv() — explicit multi-plane access для NV12 - frame.__dlpack__() / __dlpack_device__() — protocol для torch/cupy - Capsule deleter правильно держит refcount на frame_keep_alive, releases shape/strides arrays. CUDA pointer принадлежит frame. #200 Health/stats counters: - frames_received, timeouts, errors — per-call counters - last_seq, gap_count — proxy для drop count (NEWEST_ONLY mode) - last_frame_pts_ns - stats() — snapshot dict для MQTT health publish - counted в pybind layer т.к. C API не expose'ит ring_occupancy #201 Per-subscriber CUDA stream + thread-safety: - consumer_stream kwarg в subscribe() — int (cudaStream_t pointer) - subscriber.consumer_stream property - Thread-safety contract в docstring CuframesSubscriber - next_frame() передаёт consumer_stream_ в cuframes_subscriber_next #202 Smoke test + docs: - 10/10 pytest passed (расширен +2 теста на consumer_stream) - docs/python.md (~250 строк): quick start, API reference, integration с PyTorch/CuPy, reconnect-loop pattern, per-stream usage, pitch alignment, thread-safety, error taxonomy, backpressure, Phase 0 limitations Verify build + tests: cmake -B build-python -DBUILD_PYTHON_BINDINGS=ON cmake --build build-python -j pytest python/tests/ -v # 10/10 Закрывает Phase 0 issue gx/cuframes#6. Разблокирует goldix-smart-home/yolo-world-detector Phase 1. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-13 21:34:29 +01:00 · 2026-06-13 21:33:21 +01:00 · 2026-06-13 21:23:42 +01:00 · 2026-06-13 21:19:03 +01:00 · 2026-06-13 12:59:04 +01:00 · 2026-06-03 04:27:24 +01:00
55 changed files with 6530 additions and 506 deletions
@@ -0,0 +1,181 @@
+name: build
+
+on:
+  push:
+    branches: [main]
+    paths-ignore:
+      - '**.md'
+      - 'docs/**'
+      - 'BENCHMARKS.md'
+      - 'ROADMAP.md'
+      - 'CHANGELOG.md'
+      - 'LICENSE'
+      - '.gitea/ISSUE_TEMPLATE/**'
+  pull_request:
+    branches: [main]
+
+jobs:
+  cmake-build:
+    name: cmake build (CUDA 12.4, Ubuntu 22.04)
+    runs-on: ubuntu-22.04
+    container:
+      image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+    steps:
+      # actions/checkout@v4 требует Node 20+. Ubuntu 22.04 apt даёт Node 12 — не подходит.
+      # Ставим Node 20 из NodeSource repo.
+      - name: Bootstrap Node 20 + git (для actions/checkout)
+        run: |
+          set -e
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get update
+          apt-get install -y --no-install-recommends curl git ca-certificates gnupg
+          # NodeSource setup может молча упасть на slow networks (особенно через VPN
+          # на u4-runner); retry + явная verification что Node >= 18 после install.
+          for i in 1 2 3; do
+            if curl -fsSL --retry 3 --retry-delay 5 --connect-timeout 30 \
+                 https://deb.nodesource.com/setup_20.x | bash -; then
+              break
+            fi
+            echo "NodeSource setup attempt $i failed, retrying..."
+            sleep 10
+          done
+          apt-get install -y --no-install-recommends nodejs
+          NODE_VER=$(node --version)
+          echo "node: $NODE_VER"
+          # actions/checkout@v4 требует Node 20+ (ES2022 static blocks).
+          # Если NodeSource setup упал и установился Ubuntu's Node 12 — фейлим явно.
+          NODE_MAJOR=$(echo "$NODE_VER" | sed -E 's/^v([0-9]+).*/\1/')
+          if [ "$NODE_MAJOR" -lt 18 ]; then
+            echo "ERROR: Node $NODE_VER too old, NodeSource setup likely failed" >&2
+            exit 1
+          fi
+
+      - name: Install build deps
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get install -y --no-install-recommends \
+              build-essential cmake ninja-build pkg-config \
+              libavformat-dev libavcodec-dev libavutil-dev libswscale-dev
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Configure (full — libcuframes + examples + tools)
+        run: |
+          cmake -B build -S . -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DBUILD_TESTING=OFF \
+              -DBUILD_EXAMPLES=ON \
+              -DBUILD_TOOLS=ON \
+              -DBUILD_FFMPEG_FILTER=OFF \
+              -DBUILD_PYTHON_BINDINGS=OFF
+
+      - name: Build
+        run: cmake --build build --parallel
+
+      - name: Verify produced binaries + library
+        run: |
+          ls -la build/libcuframes/libcuframes.so*
+          ls -la build/libcuframes/libcuframes_static.a
+          ls -la build/tools/cuframes-rtsp-source/cuframes-rtsp-source
+          ls -la build/examples/sub_count/sub_count
+          ./build/tools/cuframes-rtsp-source/cuframes-rtsp-source --help | head -5
+
+      - name: Install + verify install layout
+        run: |
+          cmake --install build --prefix /tmp/cuframes-install
+          test -f /tmp/cuframes-install/include/cuframes/cuframes.h
+          test -f /tmp/cuframes-install/include/cuframes/cuframes.hpp
+          test -f /tmp/cuframes-install/lib/libcuframes.so
+          test -f /tmp/cuframes-install/lib/libcuframes_static.a
+
+  filter-build:
+    name: ffmpeg filter patch (out-of-tree)
+    runs-on: ubuntu-22.04
+    container:
+      image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+    needs: cmake-build
+    steps:
+      - name: Bootstrap Node 20 + git (для actions/checkout)
+        run: |
+          set -e
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get update
+          apt-get install -y --no-install-recommends curl git ca-certificates gnupg
+          # NodeSource setup может молча упасть на slow networks (особенно через VPN
+          # на u4-runner); retry + явная verification что Node >= 18 после install.
+          for i in 1 2 3; do
+            if curl -fsSL --retry 3 --retry-delay 5 --connect-timeout 30 \
+                 https://deb.nodesource.com/setup_20.x | bash -; then
+              break
+            fi
+            echo "NodeSource setup attempt $i failed, retrying..."
+            sleep 10
+          done
+          apt-get install -y --no-install-recommends nodejs
+          NODE_VER=$(node --version)
+          echo "node: $NODE_VER"
+          # actions/checkout@v4 требует Node 20+ (ES2022 static blocks).
+          # Если NodeSource setup упал и установился Ubuntu's Node 12 — фейлим явно.
+          NODE_MAJOR=$(echo "$NODE_VER" | sed -E 's/^v([0-9]+).*/\1/')
+          if [ "$NODE_MAJOR" -lt 18 ]; then
+            echo "ERROR: Node $NODE_VER too old, NodeSource setup likely failed" >&2
+            exit 1
+          fi
+
+      - name: Install build deps
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get install -y --no-install-recommends \
+              build-essential cmake ninja-build pkg-config nasm \
+              libssl-dev libx264-dev libx265-dev libnuma-dev zlib1g-dev \
+              wget patch
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build libcuframes (для linking в patched ffmpeg)
+        run: |
+          cmake -B build -S . -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DBUILD_TESTING=OFF -DBUILD_EXAMPLES=OFF -DBUILD_TOOLS=OFF
+          cmake --build build --parallel
+          cmake --install build --prefix /opt/cuframes
+
+      # Clone уже-patched FFmpeg fork с локального gitea (быстро + offline).
+      # Используем ${GITHUB_SERVER_URL} — runner подставит свой view на gitea:
+      # на R9-runner = http://192.168.88.23:3222, на u4-runner = http://10.8.0.6:3222 (VPN).
+      # Hardcoded https://git.goldix.org/... не работает на u4 — нет route к public IP.
+      - name: Clone patched FFmpeg fork (local gitea mirror)
+        run: |
+          git clone --depth 1 --branch n7.1-cuframes \
+              "${GITHUB_SERVER_URL}/gx/ffmpeg-patched.git" /src/ffmpeg
+          ls /src/ffmpeg/libavformat/cuframesdec.c
+
+      - name: Configure FFmpeg (minimal + libcuframes)
+        run: |
+          cd /src/ffmpeg
+          ./configure \
+              --prefix=/opt/ffmpeg \
+              --enable-libcuframes \
+              --extra-cflags="-I/opt/cuframes/include -I/usr/local/cuda/include" \
+              --extra-ldflags="-L/opt/cuframes/lib -L/usr/local/cuda/lib64" \
+              --extra-libs="-lcudart -lpthread -lrt -lm" \
+              --disable-x86asm --disable-everything \
+              --enable-demuxer=cuframes,rawvideo \
+              --enable-decoder=rawvideo \
+              --enable-muxer=null,rawvideo \
+              --enable-protocol=file --enable-ffmpeg \
+              --disable-doc --disable-htmlpages --disable-manpages \
+              --disable-podpages --disable-txtpages
+
+      - name: Build FFmpeg
+        run: |
+          cd /src/ffmpeg
+          make -j$(nproc) ffmpeg
+
+      - name: Verify cuframes demuxer registered
+        run: |
+          export LD_LIBRARY_PATH=/opt/cuframes/lib
+          /src/ffmpeg/ffmpeg -hide_banner -formats | grep cuframes
+          /src/ffmpeg/ffmpeg -hide_banner -h demuxer=cuframes | head -10
@@ -0,0 +1,78 @@
+name: release
+
+# Триггер: push tag v* (e.g. v0.1.0, v0.2.0).
+# Сборка: runtime Docker image + source tarball, прикладываем к gitea release.
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+jobs:
+  docker-runtime:
+    name: build runtime Docker image
+    runs-on: ubuntu-22.04
+    container:
+      image: docker.gitea.com/runner-images:ubuntu-22.04
+      # docker socket нужен — gitea runner монтирует /var/run/docker.sock
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Tag from ref
+        id: tag
+        run: |
+          TAG="${GITHUB_REF#refs/tags/v}"
+          echo "version=$TAG" >> $GITHUB_OUTPUT
+
+      - name: Login to gitea registry
+        run: |
+          echo "${{ secrets.GITEA_TOKEN }}" | docker login git.goldix.org \
+              -u "${{ github.actor }}" --password-stdin
+
+      - name: Build runtime image
+        run: |
+          docker build -f docker/Dockerfile.runtime \
+              -t git.goldix.org/gx/cuframes:${{ steps.tag.outputs.version }} \
+              -t git.goldix.org/gx/cuframes:latest \
+              .
+
+      - name: Push
+        run: |
+          docker push git.goldix.org/gx/cuframes:${{ steps.tag.outputs.version }}
+          docker push git.goldix.org/gx/cuframes:latest
+
+  source-tarball:
+    name: build source tarball
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Tag from ref
+        id: tag
+        run: |
+          TAG="${GITHUB_REF#refs/tags/v}"
+          echo "version=$TAG" >> $GITHUB_OUTPUT
+
+      - name: Create tarball
+        run: |
+          VERSION="${{ steps.tag.outputs.version }}"
+          mkdir -p /tmp/release
+          git archive --format=tar.gz --prefix="cuframes-$VERSION/" \
+              -o "/tmp/release/cuframes-$VERSION.tar.gz" HEAD
+          ls -la /tmp/release/
+
+      # Готовый artifact — пользователь скачает с release page либо attached к release.
+      # Gitea release upload через API делается отдельным шагом (см. gitea/release-action
+      # либо curl); тут оставляем артефакт как build output для последующего ручного
+      # attach. Для полной автоматизации — добавить шаг upload через curl + GITEA_TOKEN.
+      - name: Upload tarball as artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: cuframes-${{ steps.tag.outputs.version }}-source
+          path: /tmp/release/cuframes-*.tar.gz
@@ -0,0 +1,21 @@
+name: test-u4-runner
+
+on:
+  workflow_dispatch:
+  push:
+    paths:
+      - '.gitea/workflows/test-u4-runner.yml'
+
+jobs:
+  hello:
+    name: u4 runner smoke test
+    runs-on: u4
+    container:
+      image: ubuntu:24.04
+    steps:
+      - name: hostname + uname
+        run: |
+          echo "hostname: $(hostname)"
+          echo "uname: $(uname -a)"
+          echo "ip route: $(ip route | head -3)"
+          echo "test OK"
@@ -117,3 +117,95 @@ cd build && cmake -DBUILD_TESTING=ON ..  && cmake --build . && ctest -R stress -
 Production deployment замеры — см. интеграционные guides:
 - [docs/integration.md](docs/integration.md) — cctv-processor C++ pipeline
 - [filter/README.md](filter/README.md) — FFmpeg demuxer (Frigate setup)
+
+---
+
+## Real-world production deployment (2026-05-19, v0.2.0)
+
+**Setup**: 4 Dahua IP-камеры (HEVC main 1920×1080 / 2688×1520, 25 fps) → 3
+одновременных consumer'а на одном RTX 5090 хосте:
+- **Frigate** detect (ONNX D-FINE-S, 640×480) + record (full-res H.265 mp4)
+- **cctv-backend** custom C++ mosaic processor (composes 4×grid → RTSP output для TV)
+
+### Before → after (measured production, идентичный workload)
+
+| Метрика | Без cuframes | С cuframes v0.2 dual-input | Reduction |
+|---|---:|---:|---:|
+| **RTSP connections к камерам** | 12 (4 cam × 3 consumer) | **4** (publishers only) | **−67%** |
+| **NVDEC sessions** | ~8 (decode на каждый consumer) | **4** (publishers only) | **−50%** |
+| **Camera-side bandwidth** | ~34 Mbps (main+main+sub per cam) | **~16 Mbps** (main per cam) | **−54%** |
+| **PCIe D2H copies (consumer side)** | ~346 MB/s (decoded frames → host) | **~0** (zero-copy CUDA IPC) | **−100%** |
+| **Frigate ffmpeg с прямым RTSP** | 8 (detect+record × 4) | **0** (all через cuframes) | **−100%** |
+
+### Live nvidia-smi metrics в running system
+
+```
+GPU SM:     4-5%   (compute: detector + cuframes consumers)
+GPU NVDEC:  2-4%   (без cuframes ожидаемо было 15-25%)
+GPU NVENC:  0-1%
+```
+
+### VRAM breakdown (measured)
+
+| Component | VRAM |
+|---|---:|
+| 4× cuframes publishers (3× FHD ring + 1× 2688×1520 для LPR) | **4.4 GB** |
+| cctv-backend (composer + grid output) | 1.0 GB |
+| frigate.embeddings_manager (face + LPR ONNX models) | 1.6 GB |
+| frigate.detector:onnx (D-FINE-S COCO) | 0.6 GB |
+| **Total cuframes-stack VRAM** | **~7.7 GB** |
+
+Из них на сам cuframes accounting — только **4.4 GB** в publishers (ring buffers +
+NVDEC decode buffers). Consumers (Frigate, cctv-backend) держат свои CUDA
+contexts независимо.
+
+### Network bandwidth (real tcpdump, 10-sec sample)
+
+**31.5 Mbps** от camera subnet (4 cameras → R9), измерено через
+`tcpdump -w cam-traffic.pcap` за 10 секунд.
+
+Breakdown approximate:
+- 4 publishers × main HEVC RTP/UDP: **~16 Mbps** (cuframes core)
+- go2rtc on-demand streams (Frigate UI live preview, если открыт): **0-10 Mbps**
+- ONVIF discovery, RTSP keepalives, NTP-from-cameras: **~1-2 Mbps**
+
+Без cuframes тот же setup (cctv-backend + Frigate detect + Frigate record × 4
+camera) дал бы **~45-50 Mbps** (главное: record path забирал отдельный
+main stream от каждой camera).
+
+### Camera-side benefits
+
+Dahua/Hikvision камеры обычно cap'нуты на 4-5 одновременных RTSP streams.
+До cuframes setup (4 cam × 3 RTSP) делал каждую camera на **60-75% capacity**
+её RTSP server'а. После — **20-25%**, headroom на 2-3 дополнительных
+consumer'а без замены оборудования.
+
+### Что **сохранено** (важно)
+
+- **Качество записи**: record path через `cuframes_packets://` это **passthrough**
+  (`-c:v copy`), bit-exact original encoded stream от камеры. Frigate пишет mp4
+  с full-resolution оригинала, без re-encode.
+- **Latency**: <2 ms publisher → consumer (cuframes IPC) vs ~50-80 ms RTSP setup
+  latency для каждого нового consumer.
+- **Backward compatibility**: v0.2 publishers принимают v1 subscribers
+  (frames-only), rolling upgrade.
+
+### Hardware-agnostic projection (для другого setup)
+
+| If you have | Expected reduction |
+|---|---|
+| 16 cameras × 2 consumers | 32 → 16 NVDEC (−50%), 32 → 16 RTSP (−50%) |
+| 8 cameras × 3 consumers | 24 → 8 NVDEC (−67%), 24 → 8 RTSP (−67%) |
+| 4 cameras × 4 consumers (multi-AI pipeline) | 16 → 4 NVDEC (−75%), 16 → 4 RTSP (−75%) |
+
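+Абсолютная экономия (число NVDEC-сессий и RTSP-подключений) растёт **линейно**
+с N (consumers per camera); относительная reduction — как 1 − 1/N (см. таблицу
+выше). v0.1 (frames only) сэкономит NVDEC; v0.2 (frames + packets)
+**дополнительно** сэкономит RTSP connections для record/mux consumers.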
+
+### Что **НЕ** сэкономлено (честно)
+
+- **Disk space**: запись остаётся full-resolution H.265 mp4. Cuframes не сжимает.
+- **Detector inference latency**: ONNX/TensorRT detector работает на decoded
+  frames независимо от source. Cuframes только меняет где decode произошёл.
+- **Camera RTSP server CPU**: сама камера всё равно encode'ит видео. Cuframes
+  reduces **consumer-side** load, не producer-side.
@@ -5,6 +5,51 @@
 Формат основан на [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 проект следует [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.2.0] — 2026-05-19
+
+Encoded packet ring — параллельный канал для record/mux consumer'ов
+без второго RTSP-подключения к камере.
+
+См. issue [#2](https://git.goldix.org/gx/cuframes/issues/2),
+PRs [#4](https://git.goldix.org/gx/cuframes/pulls/4) (cuframes) +
+[gx/ffmpeg-patched#1](https://git.goldix.org/gx/ffmpeg-patched/pulls/1)
+(FFmpeg demuxer).
+
+### Added
+
+- **Encoded packet ring** — параллельный ring для H.264/H.265 NAL units
+  (отдельный SHM `/dev/shm/cuframes-<key>-packets`, variable-length byte
+  buffer + slot index, seqlock-style read для защиты от overrun).
+- **Wire protocol v2** (`proto_version = 2` в SHM header). Backward-compat:
+  v2 publishers принимают v1 subscribers (frames-only).
+- **Public C API** (`include/cuframes/cuframes.h`):
+  - `cuframes_publisher_enable_packets(opts)` — активирует ring
+  - `cuframes_publisher_set_codec_extradata(data, size)` — SPS/PPS
+  - `cuframes_publisher_publish_packet(data, size, pts, dts, flags)`
+  - `cuframes_subscriber_enable_packets()` + `_next_packet()` + accessors
+  - `cuframes_subscriber_get_codec_params(codec_id, extradata, size)`
+- **`cuframes::Publisher`** (C++ RAII): `enable_packets`, `set_codec_extradata`,
+  `publish_packet` методы.
+- **`cuframes-rtsp-source`**: новый CLI flag `--enable-packet-ring`.
+  Дублирует `AVPacket` в encoded ring до передачи декодеру.
+- **FFmpeg demuxer `cuframes_packets://<key>`** (отдельная ветка
+  [gx/ffmpeg-patched PR #1](https://git.goldix.org/gx/ffmpeg-patched/pulls/1)).
+  Companion к `cuframes://`. Use case: Frigate `record` role без
+  второго RTSP к камере.
+- **4 новых error codes**: `PACKET_OVERSIZED`, `NO_PACKET_RING`,
+  `NO_CODEC_PARAMS`, `PACKET_OVERRUN`.
+- **Stress test** `libcuframes/tests/test_packet_ring.c`: 2 scenarios —
+  normal flow (1 pub × 1 sub × 2000 packets, integrity check) +
+  slow consumer (must hit OVERRUN + library auto-resync на keyframe).
+- **Protocol spec §10** в `docs/protocol.md` (397 строк): byte-exact
+  layout, seqlock semantics, late-subscriber GOP-aligned start.
+
+### Limitations (документировано)
+
+- Sub-stream selection отложено в v0.3 (`<key>-substream-<N>` naming).
+- Audio packets — v0.3 (тот же ring layout, codec_id = audio).
+- Codec change mid-stream — требует publisher destroy+recreate.
+
 ## [0.1.0] — 2026-05-17

 Первый функциональный release с production deployment.
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.20)
 project(cuframes
-    VERSION 0.1.0
-    DESCRIPTION "Zero-copy frame sharing via CUDA IPC"
+    VERSION 0.4.0
+    DESCRIPTION "Zero-copy frame sharing via CUDA VMM + POSIX FD"
    LANGUAGES C CXX CUDA
 )

@@ -39,3 +39,7 @@ endif()
 if(BUILD_TOOLS)
    add_subdirectory(tools/cuframes-rtsp-source)
 endif()
+
+if(BUILD_PYTHON_BINDINGS)
+    add_subdirectory(python)
+endif()
@@ -1,10 +1,15 @@
 # cuframes

+[![build](https://git.goldix.org/gx/cuframes/actions/workflows/build.yml/badge.svg?branch=main)](https://git.goldix.org/gx/cuframes/actions?workflow=build.yml)
+[![release](https://img.shields.io/badge/release-v0.1.0-blue)](https://git.goldix.org/gx/cuframes/releases/tag/v0.1.0)
+[![license](https://img.shields.io/badge/license-LGPL--2.1+-green)](LICENSE)
+
 Zero-copy sharing декодированных видеокадров между процессами через CUDA IPC.

-**Статус:** v0.1 — libcuframes готов, cuframes-rtsp-source готов, e2e-pipeline
-протестирован (4×subscriber × 2000 frames, 0 torn). FFmpeg filter — v0.2.
-**Лицензия:** LGPL-2.1+
+**Статус:** v0.1.0 released — production-deployed на multi-camera CCTV-стеке
+(Frigate + custom C++ processor, оба используют один publisher на одном NVDEC).
+См. [BENCHMARKS.md](BENCHMARKS.md) для measurements, [ROADMAP.md](ROADMAP.md)
+для v0.2 plans.

 ## Минимальные требования

@@ -59,6 +59,36 @@ ETA: 1-2 недели focused работы.
 | Frigate plugin POC (Python side, не FFmpeg) | Альтернативный путь для users которые не хотят патчить FFmpeg |
 | Docker images в public registry | Snapshot CI-built tarballs + multi-arch |

+## Future ideas 💡 (не запланированы, без ETA)
+
+Идеи которые не привязаны к конкретной версии и ждут планирования.
+
+### `gst-cuframes-src` — GStreamer source-element
+
+Аналог FFmpeg-демуксера для GStreamer-стека. Один publisher cuframes-side → потребители-pipeline'ы в GStreamer (DeepStream, обычные GStreamer-приложения).
+
+| Зачем | Что |
+|---|---|
+| NVIDIA DeepStream — это GStreamer-native, FFmpeg-демуксер там не работает | `gst-cuframes-src` как `GstBaseSrc`-derived element, выдаёт `GstBuffer` с `GstCudaMemory` (NVMM в Jetson вариант) |
+| GStreamer-приложения (обычный software) | Drop-in source для любой GStreamer pipeline |
+| GStreamer plugin registry | `gst-inspect-1.0 cuframessrc` discoverable |
+
+Open questions: какой memory-type — `memory:CUDAMemory` (mainline) vs `memory:NVMM` (NVIDIA DeepStream-specific). Возможно два варианта/build flags.
+
+### `vf_cuda_grid` — **выделен в отдельный продукт `gx/vf-cuda-grid`** ([repo](https://git.goldix.org/gx/vf-cuda-grid))
+
+FFmpeg filter для GPU-native video grid composition + control-plane sidecar
+(ZeroMQ/MQTT/HTTP/HA Discovery). Дизайн зафиксирован, см.
+[`gx/vf-cuda-grid` docs/design.md](https://git.goldix.org/gx/vf-cuda-grid/src/branch/main/docs/design.md)
+и [epic issue #1](https://git.goldix.org/gx/vf-cuda-grid/issues/1).
+
+Cuframes остаётся frame source provider для vf-cuda-grid в нашей экосистеме
+(но vf-cuda-grid работает и с любым другим CUDA frame source — стандартный FFmpeg).
+
+Закрывает [`gx/cctv#22`](https://git.goldix.org/gx/cctv/issues/22) Phase 4
+(end-to-end GPU pipeline для cctv-processor mosaic composer) после Phase 4 vf-cuda-grid +
+миграция cctv-processor GridComposer → vf_cuda_grid filter.
+
 ## v1.0 — Stable ABI 📋

 - Стабильный wire-protocol (minor versions add fields в reserved space)
@@ -16,7 +16,8 @@
 #     /usr/local/bin/cuframes-rtsp-source --rtsp ... --key ...

 # ─── Build stage ─────────────────────────────────────────────────────────
-FROM nvidia/cuda:13.0.3-cudnn-devel-ubuntu24.04 AS build
+# CUDA 12.4 — matching ffmpeg-vf-cuda-grid base + Frigate stable-tensorrt
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS build

 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -36,12 +37,13 @@ RUN cmake -B build -S . -G Ninja \
    && cmake --build build --parallel

 # ─── Runtime stage ────────────────────────────────────────────────────────
-FROM nvidia/cuda:13.0.3-cudnn-runtime-ubuntu24.04 AS runtime
+FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 AS runtime

 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y --no-install-recommends \
-        libavcodec60 libavformat60 libavutil58 \
+        libavcodec58 libavformat58 libavutil56 \
        ca-certificates \
+        mosquitto-clients \
    && rm -rf /var/lib/apt/lists/*

 # libcuframes.so → /usr/local/lib (стандартный путь для ldconfig)
@@ -423,3 +423,84 @@ mosaic + RTSP-server. После v1 cuframes:
 3. После Phase 0 — review результатов, корректировка дизайна (если CUDA IPC
   повёл себя не как ожидали)
 4. Phase 1+ по плану
+
+---
+
+# Appendix A — Production deployment notes (post-v0.1.0)
+
+Реальные наблюдения после первого production deployment (Frigate + cctv-processor
+на RTX 5090, 24h+ uptime). Обновляется по мере накопления опыта.
+
+## Что подтвердилось из изначального дизайна
+
+- **CUDA IPC handshake через cudaIpcEventHandle_t работает стабильно** — нет
+  ни одного torn frame за 24+ часов на 2 consumer'ах.
+- **EXTERNAL ownership** (publisher передаёт свои pre-allocated CUDA pointers)
+  необходим для FFmpeg-based publisher — иначе нужен extra cudaMemcpy из FFmpeg's
+  hwframe pool в library-managed pool.
+- **Unix socket handshake** ОК — простой, debug'абельный (`socat` для inspect).
+- **POSIX shm для header + atomic seq counters** — race-free на reader side.
+
+## Что пришлось доработать в v0.1.0 vs initial design
+
+- **CMake install rules** изначально не предусмотрены. Downstream проекты
+  делали `cmake --install` → пустой prefix. Fix: `install(TARGETS ...)` +
+  `install(DIRECTORY include/cuframes ...)`. Урок — install rules должны
+  быть с day 1.
+- **Variable HINTS в find_library**: пользователи делают install в разные
+  prefix'ы. HINTS для downstream `find_library(cuframes)` должны включать
+  `$PREFIX/lib`, `$PREFIX/lib64`, и `build-dir/libcuframes/` для local-dev.
+
+## Что не учли в дизайне (открытые грабли — см. troubleshooting.md)
+
+### Cross-container CUDA IPC требует **shared pid + ipc namespace**
+
+`cudaIpcOpenEventHandle` validates IPC peer через `/proc/<pid>/...`. Если
+consumer container не в same PID namespace что publisher — fail с
+`invalid device context`.
+
+Это **incompatible** с s6-overlay-based containers (linuxserver.io stack,
+Frigate), требующими PID 1 для self. Workaround: только `ipc:` shared,
+accept race window (works на Frigate в практике потому что подключается
+первым после publisher restart). **Real fix planned v0.2**: socket-based
+context validation вместо `/proc` reliance.
+
+### Publisher-side resize нужен для consumers без cuda-llvm
+
+Большинство downstream FFmpeg builds — без `--enable-cuda-llvm` (на платформах
+с glibc < 2.38 эта опция не собирается, нужен `stdbit.h`). Без cuda-llvm нет
+`scale_cuda` filter. Consumer вынужден CPU-resize либо отключать hwaccel.
+
+**Fix planned v0.2**: publisher принимает `--scale=WxH` и делает GPU resize
+до publish. Consumer получает уже scaled frames, scale_cuda не нужен.
+
+### Encoded packet sharing — отсутствует в v0.1
+
+cuframes v0.1 раздаёт **только decoded** NV12. Для `record` use case
+(`-c:v copy` mux без decode) consumer всё ещё открывает свой RTSP — лимит
+камеры на concurrent streams (4-5 у Dahua) hit'ится.
+
+**v0.2 spec**: parallel encoded-packets ring + `cuframes_packets://`
+demuxer. См. [issue #2](https://git.goldix.org/gx/cuframes/issues/2).
+
+## Production setup (gold path)
+
+```
+                           ┌─► Frigate (FFmpeg cuframes:// demuxer) → detect
+Camera RTSP ─► publisher ──┤
+   (1× NVDEC)               └─► cctv-processor (CuframesSource C++ API) → motion+RTSP-encode→TV
+```
+
+| Метрика | Without cuframes (baseline) | С cuframes v0.1 |
+|---|---|---|
+| NVDEC operations на parking-камеру | 2 (Frigate detect + cctv detect) | **1** (publisher) |
+| VRAM extra cost | 0 (каждый своё) | ~3 MB (ring 6×460KB sub-stream) |
+| RTSP camera load | 2 streams | **1** stream |
+| Uptime (verified) | n/a | 24h+ без drops |
+
+## См. также
+
+- [docs/troubleshooting.md](troubleshooting.md) — конкретные грабли + fixes
+- [BENCHMARKS.md](../BENCHMARKS.md) — измерения
+- [docs/integrations/frigate.md](integrations/frigate.md) — guide для Frigate
+- [ROADMAP.md](../ROADMAP.md) — v0.2/v0.3/v1.0
@@ -1,11 +1,20 @@
 # Integration guide

-Этот guide описывает, как использовать cuframes для устранения дублирующего
-GPU-декодирования между несколькими consumer'ами одного RTSP-потока.
+Хочешь подключить cuframes к своему проекту? Выбери guide по типу integration'а:
+
+## Готовые reference guides
+
+| Тип integration'а | Guide | Reference deployment |
+|---|---|---|
+| **Frigate NVR** (через FFmpeg `cuframes://` demuxer) | [integrations/frigate.md](integrations/frigate.md) | Production: Frigate 0.17.1 + RTX 5090 + Dahua HEVC |
+| **C++ project** (через `CuframesSource` pattern) | [integrations/cctv-cpp.md](integrations/cctv-cpp.md) | Production: [gx/cctv](https://git.goldix.org/gx/cctv) C++17 processor |
+| **Python AI/ML pipeline** (через ctypes wrapper) | [examples/python-consumer/](../examples/python-consumer/) | Skeleton ready; v0.3 даст native bindings |
+| **FFmpeg-based custom tool** (своя сборка ffmpeg) | [filter/README.md](../filter/README.md) | Out-of-tree patch + build instructions |

 ## Целевой сценарий (motivation)

-В типичной CCTV-системе один и тот же RTSP-stream декодируется несколько раз:
+В типичной CCTV / video-analytics системе один и тот же RTSP-поток
+декодируется **несколько раз**:

 ```
   Камера ──► RTSP ──► Frigate          (decode #1: detection + recording)
@@ -13,13 +22,14 @@ GPU-декодирования между несколькими consumer'ами
                  ─►   AI-скрипт        (decode #3: классификация / OCR)
 ```

-На 16 камер × 25 fps × 3 consumer'а = 1200 NVDEC-операций/сек. RTX 5090 имеет
-~3 NVDEC-движка, но шина PCIe и memory bandwidth становятся узким местом.
+На 16 камер × 25 fps × 3 consumer'а = **1200 NVDEC operations/sec**. RTX 5090
+имеет ~3 NVDEC-движка с capacity ~50 FHD25 streams → загрузка близка к лимиту,
+плюс tax на PCIe bandwidth и memory.

 С cuframes:

 ```
-   Камера ──► cuframes-rtsp-source ──► CUDA frame в /dev/shm + cudaIpcEvent
+   Камера ──► cuframes-rtsp-source ──► CUDA frame в VRAM + IPC handles
                                              │
                                              ├──► Frigate          (zero-copy)
                                              ├──► mosaic-сервер    (zero-copy)
@@ -27,242 +37,36 @@ GPU-декодирования между несколькими consumer'ами
 ```

 Decode выполняется **один раз** на источник, потребители получают тот же CUDA
-device pointer без копий.
+device pointer без копий. **3× меньше NVDEC operations** на том же setup'е.

-## Текущие limitations v0.1
+## Текущие ограничения (v0.1)

- **Frigate** (по состоянию на 0.17) **не имеет** plugin-точки для приёма
-  готовых CUDA-frames. Чтобы убрать Frigate decode полностью, нужен:
-  - либо FFmpeg-filter `vf_cuda_ipc_input` (planned для cuframes v0.2 — требует
-    patch FFmpeg upstream и пересборку Frigate's bundled ffmpeg),
-  - либо Frigate-plugin (требует upstream работы с командой Frigate).
- В v0.1 практическое улучшение: **исключить decode для всех custom consumer'ов
-  кроме Frigate** (то есть cctv-processor, AI-скрипты — на cuframes; Frigate
-  остаётся как есть, со своим decode).
+- **Decoded frame sharing only** (не encoded). Для `record` path в Frigate
+  (mux без decode) consumer всё ещё открывает свой RTSP — это решит **v0.2
+  encoded packet sharing** (см. [issue #2](https://git.goldix.org/gx/cuframes/issues/2)).

-Это уже даёт значительную экономию: было 1×Frigate + N×consumer decode'ов,
-стало 1×Frigate + 1×cuframes-rtsp-source (один на все consumer'ы).
+- **NV12 frame format only**. Other formats (YUV420P, RGB) — v0.2.

-## Сценарий 1: cuframes-rtsp-source + cctv-processor (FRIGATE остаётся)
+- **GPU → CPU copy** в FFmpeg demuxer'е (`cudaMemcpy2DAsync`). Zero-copy через
+  `AVHWFramesContext` — v0.2.

-### docker-compose.yml
+- **Cross-container CUDA IPC** требует shared `ipc + pid` namespace. Если
+  consumer использует s6-overlay (как Frigate) — pid не shareable, нужен
+  workaround (см. [integrations/frigate.md](integrations/frigate.md)
+  troubleshooting).

-```yaml
-services:
-  # Один источник на камеру — публикует декодированный поток через cuframes IPC
-  cuframes-cam-parking:
-    image: gx/cuframes-rtsp-source:0.1
-    restart: unless-stopped
-    runtime: nvidia
-    environment:
-      NVIDIA_VISIBLE_DEVICES: all
-      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
-    # CRITICAL: --ipc=shareable для cross-container CUDA IPC
-    ipc: shareable
-    shm_size: 1g
-    volumes:
-      - cuframes_sock:/run/cuframes
-    command:
-      - --rtsp=rtsp://admin:${CAM_PASS}@192.168.88.98:554/cam/realmonitor?channel=1&subtype=0
-      - --key=cam-parking
-      - --ring=6
-      - --realtime  # не нужен для RTSP (real-time источник), оставлен для file://
+- **Только Linux + NVIDIA GPU** compute capability ≥ 7.5 (Turing+).

-  # Frigate (как и был — со своим decode на main+sub streams)
-  frigate:
-    image: ghcr.io/blakeblackshear/frigate:stable-tensorrt
-    # ... как обычно
+## Production reference deployments

-  # cctv-processor — подписывается на cuframes (без отдельного RTSP decode)
-  cctv-backend:
-    image: gx/cctv-processor:cuda
-    restart: unless-stopped
-    runtime: nvidia
-    # CRITICAL: shared IPC + PID namespace с publisher'ом (см. ниже)
-    ipc: container:cuframes-cam-parking
-    pid: container:cuframes-cam-parking
-    volumes:
-      - cuframes_sock:/run/cuframes:ro
-    environment:
-      # cuframes-keys для backend'а:
-      CCTV_SOURCES: cuframes:cam-parking,cuframes:cam-front-gate,...
+| Setup | Версия | Где смотреть |
+|---|---|---|
+| 1 publisher (1× NVDEC) → Frigate (detect) + cctv-backend (motion+grid→RTSP→TV) | v0.1.0 | [BENCHMARKS.md](../BENCHMARKS.md), [integrations/frigate.md](integrations/frigate.md) |

-volumes:
-  cuframes_sock:
-```
+## Roadmap для v0.2+

-**Важно — оба флага обязательны** для cross-container CUDA IPC:
+Полный roadmap — [ROADMAP.md](../ROADMAP.md). Highlights:

-| Флаг | Зачем |
-|---|---|
-| `ipc: container:<publisher>` | shared `/dev/shm` (нужен для `shm_open` под header/sockets) |
-| `pid: container:<publisher>` | CUDA driver валидирует IPC peer через `/proc/<pid>/...`; без этого `cudaIpcOpenEventHandle` падает с `invalid device context` |
-
-Альтернативы:
- Запускать consumer внутри того же container'а через `docker exec` (наследует все namespaces) — удобно для отладки.
- `--ipc=host --pid=host` — убирает namespacing вообще, но ослабляет изоляцию (не рекомендуется в production).
-
-### Изменения в cctv-processor
-
-Нужно добавить новый Source-тип (рядом с RtspSource) — `CuframesSource`:
-
-```cpp
-// cpp/apps/cctv-processor/src/sources/cuframes_source.hpp
-#include <cuframes/cuframes.hpp>
-
-class CuframesSource : public IVideoSource {
-public:
-    CuframesSource(const std::string &key) : key_(key) {
-        cuframes::SubscriberOptions opt;
-        opt.key = key;
-        opt.consumer_name = "cctv-processor";
-        opt.mode = CUFRAMES_MODE_NEWEST_ONLY;
-        sub_ = std::make_unique<cuframes::Subscriber>(opt);
-        cudaStreamCreate(&stream_);
-    }
-
-    // Вызывается processing-loop'ом
-    std::optional<GpuFrame> nextFrame() override {
-        auto f = sub_->next(stream_, 100);  // 100ms timeout
-        if (!f) return std::nullopt;
-        // cudaStreamWaitEvent уже сделан внутри next() — frame готов на stream_
-        return GpuFrame{
-            .cuda_ptr = f->cuda_ptr(),
-            .width = f->width(),
-            .height = f->height(),
-            .pitch_y = f->pitch_y(),
-            .pitch_uv = f->pitch_uv(),
-            .seq = f->seq(),
-            .pts_ns = f->pts_ns(),
-            .stream = stream_,
-            ._release = std::move(f),  // RAII release при destroy
-        };
-    }
-
-private:
-    std::string key_;
-    std::unique_ptr<cuframes::Subscriber> sub_;
-    cudaStream_t stream_;
-};
-```
-
-Конфиг `cameras.json` — добавить альтернативный source-тип:
-
-```jsonc
-{
-  "cameras": [
-    {
-      "id": "parking",
-      "source_type": "cuframes",          // вместо "rtsp"
-      "cuframes_key": "cam-parking",
-      // rtsp_url больше не нужен — он используется cuframes-rtsp-source'ом
-    }
-  ]
-}
-```
-
-## Сценарий 2: AI-скрипт на Python (subscriber)
-
-Python-bindings — в Phase 3 cuframes. Сейчас простой workaround через
-ctypes:
-
-```python
-import ctypes
-lib = ctypes.CDLL("libcuframes.so")
-# ... wrap нужные функции — см. include/cuframes/cuframes.h
-```
-
-Или: writer simple C-обёртку, которая принимает callback и публикует
-данные через ZMQ / shared memory в python-process.
-
-## Сценарий 3: Замена Frigate decode (v0.2+)
-
-Целевой сценарий — Frigate тоже подписан на cuframes. Реализуется через
-один из двух путей:
-
-### Путь A: FFmpeg filter
-
-Добавить out-of-tree filter `vf_cuda_ipc_input` который читает кадр из
-cuframes ring и эмитит AVFrame в pipeline. Frigate использует ffmpeg для
-RTSP/decode — заменяем "RTSP→decode→detect" на
-"cuframes_ipc_input→detect" (без decode'а вообще).
-
-Требования:
- Patch ffmpeg sources (libavfilter/vf_cuda_ipc_input.c + Makefile)
- Сборка кастомного Frigate-образа с patched ffmpeg
- Тестирование на совместимость с Frigate's pipeline assumptions
-
-### Путь B: Frigate plugin
-
-Engage с upstream Frigate чтобы добавить custom Source-type ("cuframes://").
-Это требует Python-API изменений в Frigate's source layer.
-
-## Verification checklist
-
-После настройки убедитесь:
-
-```bash
-# 1. Publisher запущен и socket существует
-ls -la /run/cuframes/cam-parking.sock
-ls -la /dev/shm/cuframes-cam-parking
-
-# 2. Контейнеры в одном IPC и PID namespace
-docker inspect cuframes-cam-parking cctv-backend \
-  -f '{{.Name}} ipc={{.HostConfig.IpcMode}} pid={{.HostConfig.PidMode}}'
-# Publisher: ipc=shareable pid=(default)
-# Consumer:  ipc=container:cuframes-cam-parking pid=container:cuframes-cam-parking
-
-# 3. Subscriber connect успешен
-docker exec cctv-backend /usr/local/bin/sub_count --key cam-parking --max-frames 10
-# Ожидаем:
-#   [sub_count] connected to 'cuframes-cam-parking'
-#   [sub_count] received=10 gaps=0 elapsed=0.4s avg_fps=25
-
-# 4. NVDEC utilization — должно быть N decodes, а не N*M
-nvidia-smi dmon -s u
-# Колонка %dec должна показать decode-нагрузку одного instance на камеру
-```
-
-## Troubleshooting
-
-### `Subscriber::create: timeout`
-Subscriber не нашёл publisher. Причины:
- Publisher не запущен или crashed — проверь `docker logs cuframes-cam-parking`
- Socket-файл не volumes'нут в consumer-контейнер — добавь `volumes:
-  - cuframes_sock:/run/cuframes:ro` в consumer'е
- IPC namespace не совпадает — см. checklist пункт 2
-
-### `cudaIpcOpenEventHandle: invalid device context`
-Проявляется в **отдельном** consumer-container'е после успешного handshake (socket
-открыт, header валиден, но open event handle не проходит).
-
-Причина: CUDA driver валидирует sender'а IPC peer'а через `/proc`. Если PID
-namespace не совпадает, sender невидим — context считается невалидным.
-
-Fix: добавить `pid: container:<publisher>` в consumer's compose service (рядом
-с `ipc: container:<publisher>`). Проверено на CUDA 13.0 + driver 555+.
-
-### `cudaIpcOpenMemHandle returned 'invalid device pointer'`
- Контейнеры в РАЗНЫХ ipc namespace — должны быть в одном (через
-  `ipc: container:<publisher>` или общий `ipc: shareable`)
- Subscriber работает на другом CUDA device — `--cuda-device` должен совпадать
-  у publisher и subscriber (одно и то же физическое GPU)
-
-### Высокая latency (>50ms tail)
- Subscriber slow — frames копятся в ring, по политике DROP_OLDEST они
-  пропускаются. Используй `CUFRAMES_MODE_NEWEST_ONLY` (default) — это нормально
-  для real-time системы.
- При STRICT_ORDER + STRICT_WAIT — slow consumer блокирует publisher. Не
-  рекомендуется для CCTV.
-
-### Frigate показывает чёрный экран после интеграции
- Frigate не подключён к cuframes (v0.1 — это not yet supported). В v0.1
-  Frigate должен оставаться на своём RTSP decode (см. Сценарий 1).
-
-## Roadmap
-
- **v0.1** (текущая): standalone publisher/subscriber, C/C++ API, examples.
- **v0.2**: FFmpeg filter `vf_cuda_ipc_input` (out-of-tree), Python bindings.
- **v0.3**: NVENC-bridge для re-encode подписчиков, Frigate plugin
-  proof-of-concept.
- **v1.0**: stable ABI, multi-GPU, documented Frigate integration.
+- **v0.2**: encoded packet sharing (Frigate record без второго RTSP), FFmpeg upstream PR, publisher-side resize для устранения scale_cuda dependency
+- **v0.3**: pybind11 Python bindings, Jetson/arm64 support
+- **v1.0**: stable ABI, multi-GPU, env-based credentials
@@ -0,0 +1,309 @@
+# C++ project integration (cctv-processor pattern)
+
+Reference guide на основе реального production deployment
+([gx/cctv](https://git.goldix.org/gx/cctv) — C++17 video processor).
+
+## Use case
+
+Custom video pipeline (motion detection, mosaic compose, encode-out, snapshots),
+получает кадры с N камер и выполняет per-frame processing. Без cuframes:
+один RTSP+NVDEC на каждую камеру **внутри** processor + дублирующий decode
+если Frigate/AI script тоже подключены к той же камере.
+
+С cuframes: processor подписывается на published frames, **никакого RTSP / NVDEC**
+у него — все консьюмеры используют один decode от publisher'а.
+
+## Архитектурный паттерн
+
+Выделить **interface** `IFrameSource` чтобы pipeline не зависел от конкретного
+источника (RTSP vs cuframes vs тестовый file).
+
+```cpp
+// include/sources/IFrameSource.h
+namespace cctv::sources {
+
+enum class ConnectionState {
+    DISCONNECTED, CONNECTING, CONNECTED, RECONNECTING, ERROR
+};
+
+struct StreamInfo {
+    int width = 0;
+    int height = 0;
+    double fps = 0.0;
+    std::string codec_name;
+    int64_t bitrate = 0;
+};
+
+class IFrameSource {
+public:
+    using FrameCallback = std::function<void(const cv::Mat& frame, int64_t ts_ms)>;
+    using StateCallback = std::function<void(ConnectionState, const std::string&)>;
+
+    virtual ~IFrameSource() = default;
+    virtual bool connect(const std::string& url) = 0;
+    virtual void disconnect() = 0;
+    virtual bool isConnected() const = 0;
+    virtual void setFrameCallback(FrameCallback) = 0;
+    virtual void setStateCallback(StateCallback) = 0;
+    virtual void setReconnectEnabled(bool) = 0;
+    virtual StreamInfo getStreamInfo() const = 0;
+    virtual ConnectionState getState() const = 0;
+    virtual std::string getLastError() const = 0;
+    virtual uint64_t getFramesReceived() const = 0;
+    virtual uint64_t getFramesDropped() const = 0;
+    virtual double getCurrentFPS() const = 0;
+};
+
+}  // namespace cctv::sources
+```
+
+`RTSPClient` (legacy) и `CuframesSource` оба implement `IFrameSource`. Pipeline
+работает с `unique_ptr<IFrameSource>` — code не знает, RTSP это или cuframes.
+
+## CuframesSource — реализация
+
+```cpp
+// include/sources/CuframesSource.h
+#include "sources/IFrameSource.h"
+
+// Forward-declare — не утекают в header
+struct cuframes_subscriber;
+typedef struct cuframes_subscriber cuframes_subscriber_t;
+
+namespace cctv::sources {
+
+class CuframesSource : public IFrameSource {
+public:
+    CuframesSource();
+    ~CuframesSource() override;
+
+    // IFrameSource: URL для cuframes — это просто `key` (либо "cuframes://<key>")
+    bool connect(const std::string& url) override;
+    void disconnect() override;
+    // ... остальные методы (см. полный файл в gx/cctv repo)
+
+    void setCudaDevice(int device);
+    void setReconnectInterval(int seconds);
+
+private:
+    void workerThread();
+    bool openSubscriber();
+    void closeSubscriber();
+
+    std::string m_key;
+    int m_cudaDevice = 0;
+    cuframes_subscriber_t* m_sub = nullptr;
+    void* m_cudaStream = nullptr;        // cudaStream_t, opaque
+    void* m_hostBuffer = nullptr;        // pinned host buffer для NV12
+    size_t m_hostBufferSize = 0;
+    std::thread m_thread;
+    std::atomic<bool> m_shouldStop{false};
+    // ... callbacks, state, stats
+};
+
+}  // namespace cctv::sources
+```
+
+### Worker thread (core)
+
+```cpp
+void CuframesSource::workerThread() {
+    while (!m_shouldStop.load()) {
+        if (!m_sub) {
+            if (!openSubscriber()) {
+                changeState(ConnectionState::RECONNECTING, m_lastError);
+                if (!m_reconnectEnabled) return;
+                sleep_for(seconds(m_reconnectInterval));
+                continue;
+            }
+            changeState(ConnectionState::CONNECTED, "");
+        }
+
+        cuframes_frame_t* frame = nullptr;
+        int rc = cuframes_subscriber_next(m_sub, m_cudaStream, &frame, 200);
+
+        if (rc == CUFRAMES_ERR_TIMEOUT) continue;
+        if (rc == CUFRAMES_ERR_DISCONNECTED) {
+            closeSubscriber();
+            changeState(ConnectionState::RECONNECTING, "publisher disconnected");
+            continue;
+        }
+        if (rc != CUFRAMES_OK || !frame) {
+            LOG_ERROR("cuframes next: " + std::string(cuframes_strerror(rc)));
+            closeSubscriber();
+            continue;
+        }
+
+        // Frame metadata
+        int32_t w, h;
+        cuframes_frame_size(frame, &w, &h);
+        const int32_t pitch_y  = cuframes_frame_pitch_y(frame);
+        const int32_t pitch_uv = cuframes_frame_pitch_uv(frame);
+        const int64_t pts_ns   = cuframes_frame_pts_ns(frame);
+
+        // Ensure host buffer big enough
+        const size_t need = (size_t)w * h * 3 / 2;  // NV12 packed
+        if (need > m_hostBufferSize) {
+            cudaFreeHost(m_hostBuffer);
+            cudaMallocHost(&m_hostBuffer, need);
+            m_hostBufferSize = need;
+        }
+
+        // Copy GPU NV12 → host NV12 (Y plane + UV plane)
+        uint8_t* cu = (uint8_t*)cuframes_frame_cuda_ptr(frame);
+        cudaMemcpy2DAsync(m_hostBuffer, w, cu, pitch_y,
+                          w, h, cudaMemcpyDeviceToHost, m_cudaStream);
+        cudaMemcpy2DAsync((uint8_t*)m_hostBuffer + (size_t)w*h, w,
+                          cu + (size_t)pitch_y*h, pitch_uv,
+                          w, h/2, cudaMemcpyDeviceToHost, m_cudaStream);
+        cudaStreamSynchronize(m_cudaStream);
+
+        // Release frame BEFORE downstream processing — publisher может переиспользовать slot
+        cuframes_subscriber_release(m_sub, frame);
+
+        // NV12 → BGR (CPU) — downstream pipeline ожидает cv::Mat BGR
+        cv::Mat nv12(h * 3 / 2, w, CV_8UC1, m_hostBuffer);
+        cv::Mat bgr;
+        cv::cvtColor(nv12, bgr, cv::COLOR_YUV2BGR_NV12);
+
+        // Доставка через callback
+        if (m_frameCallback) m_frameCallback(bgr, pts_ns / 1000000);
+    }
+    closeSubscriber();
+}
+```
+
+`cudaMemcpy → CPU → cv::cvtColor` это v0.1 path. **Zero-copy** через
+`AVHWFramesContext` / OpenCV cv::cuda::GpuMat — planned v0.2.
+
+## Factory pattern (per-camera)
+
+```cpp
+// В StreamProcessor::initializeComponents()
+for (const auto& camera : cameras) {
+    if (!camera.enabled) continue;
+
+    std::unique_ptr<sources::IFrameSource> source;
+    if (camera.source_type == "cuframes") {
+        source = std::make_unique<sources::CuframesSource>();
+    } else {
+        source = std::make_unique<rtsp::RTSPClient>();  // legacy RTSP
+    }
+
+    source->setFrameCallback([this, id = camera.id](const cv::Mat& frame, int64_t ts) {
+        m_videoProcessor->processFrame(id, frame);
+    });
+    source->setStateCallback([this, id = camera.id](auto state, const std::string& msg) {
+        // logging, alerting, watchdog
+    });
+    source->setReconnectEnabled(true);
+
+    m_frameSources[camera.id] = std::move(source);
+}
+```
+
+В `start()` — отдельный цикл:
+
+```cpp
+for (const auto& camera : cameras) {
+    if (!camera.enabled) continue;
+    auto& src = m_frameSources[camera.id];
+    const std::string url = (camera.source_type == "cuframes")
+        ? camera.cuframes_key
+        : camera.rtsp_url;
+    src->connect(url);
+}
+```
+
+## CMake integration
+
+```cmake
+# cmake/Dependencies.cmake
+if(ENABLE_CUDA AND CUDA_AVAILABLE)
+    find_path(CUFRAMES_INCLUDE_DIR cuframes/cuframes.h
+        HINTS ${CUFRAMES_ROOT}/include /usr/local/include /usr/include
+    )
+    find_library(CUFRAMES_LIBRARY cuframes
+        HINTS ${CUFRAMES_ROOT}/lib ${CUFRAMES_ROOT}/lib64
+              /usr/local/lib /usr/lib
+    )
+    if(CUFRAMES_INCLUDE_DIR AND CUFRAMES_LIBRARY)
+        set(CUFRAMES_FOUND TRUE)
+        find_package(CUDAToolkit REQUIRED)
+        message(STATUS "cuframes: FOUND (${CUFRAMES_LIBRARY})")
+    else()
+        message(STATUS "cuframes: NOT FOUND (camera source_type=cuframes недоступен)")
+    endif()
+endif()
+```
+
+```cmake
+# apps/your-processor/CMakeLists.txt
+if(CUFRAMES_FOUND)
+    target_include_directories(your-processor PRIVATE ${CUFRAMES_INCLUDE_DIR})
+    target_link_libraries(your-processor PRIVATE ${CUFRAMES_LIBRARY} CUDA::cudart)
+    target_compile_definitions(your-processor PRIVATE CCTV_HAVE_CUFRAMES=1)
+endif()
+```
+
+`CuframesSource.cpp` оборачивается в `#ifdef CCTV_HAVE_CUFRAMES` — без cuframes
+в системе фабрика возвращает error при `source_type == "cuframes"`, остальное
+компилируется как обычно.
+
+## Config
+
+`cameras.json` extension:
+
+```json
+{
+  "cameras": [
+    {
+      "id": 1,
+      "name": "Парковка через cuframes",
+      "source_type": "cuframes",
+      "cuframes_key": "cam-parking",
+      "rtsp_url": "",
+      "enabled": true,
+      "motion_detection": { "enabled": false, ... }
+    },
+    {
+      "id": 2,
+      "name": "Камера на RTSP",
+      "rtsp_url": "rtsp://admin:pw@cam-ip:554/stream",
+      "enabled": true
+    }
+  ]
+}
+```
+
+## Runtime requirements
+
+Consumer container/process должен:
+1. Иметь доступ к `/run/cuframes` (volume mount от publisher'а).
+2. Быть в **same** IPC namespace (для `/dev/shm` shared) — `ipc: container:<publisher>`.
+3. Быть в **same** PID namespace (для CUDA driver IPC validation) — `pid: container:<publisher>` (если consumer не имеет PID-1-strict init типа s6-overlay).
+4. Иметь NVIDIA runtime — `runtime: nvidia` в compose.
+5. Запускаться с правом доступа к socket (по умолчанию root) — `user: root` в compose.
+
+Пример compose service:
+
+```yaml
+your-cctv-backend:
+  image: your-image:cuda
+  runtime: nvidia
+  user: root  # socket в publisher container root-owned
+  ipc: "container:cuframes-pub-parking"
+  pid: "container:cuframes-pub-parking"  # если ваш image не использует s6
+  environment:
+    NVIDIA_VISIBLE_DEVICES: all
+    NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+  volumes:
+    - cuframes_sock:/run/cuframes:ro
+```
+
+## См. также
+
+- [filter/README.md](../../filter/README.md) — FFmpeg demuxer (если ваш processor построен на FFmpeg)
+- [docs/integrations/frigate.md](frigate.md) — Frigate-specific guide
+- [docs/architecture.md](../architecture.md) — внутренности CUDA IPC
+- [Полный код CuframesSource](https://git.goldix.org/gx/cctv/src/branch/enterprise/develop/cpp/apps/cctv-processor/src/sources/CuframesSource.cpp) — реальный production-tested файл
@@ -0,0 +1,364 @@
+# Frigate integration
+
+Полный production-tested guide для интеграции cuframes с
+[Frigate NVR](https://github.com/blakeblackshear/frigate). На основе реального
+deployment (Frigate 0.17.1-tensorrt + RTX 5090 + Dahua HEVC камеры).
+
+## Что вы получаете
+
+- **Один NVDEC decode на камеру** вместо одного у Frigate + одного у каждого
+  другого consumer'а (cctv-processor, AI-скрипт, mosaic-сервер).
+- Frigate видит decoded frames через **обычный FFmpeg URL** — никакого fork'а
+  Frigate-кода. Frigate сам не подозревает что под капотом cuframes.
+
+## Что вы НЕ получаете в v0.1
+
+- **Record path** (`-c:v copy` для архива) — этот path в Frigate всё ещё через
+  свой отдельный RTSP. v0.2 cuframes решит это через encoded packet sharing
+  (см. [issue #2](https://git.goldix.org/gx/cuframes/issues/2)).
+- Hwaccel CUDA filters для detect resize (`scale_cuda`) — наш minimal FFmpeg
+  собран без `--enable-cuda-llvm` (не работает на glibc < 2.38 что у Debian 12,
+  на котором Frigate base). Workaround: `hwaccel_args: []` в config → CPU
+  scale (cost ~5-10% CPU на FHD25).
+
+## Архитектура
+
+```
+Camera RTSP ──► cuframes-rtsp-source ──► [NVDEC ─► NV12 in CUDA IPC]
+                                              │
+                                              ├──► Frigate (ffmpeg -f cuframes) → detect
+                                              ├──► cctv-processor (CuframesSource) → motion+mosaic
+                                              └──► AI-script (Python ctypes) → inference
+```
+
+## Требования
+
+| | Минимум | Note |
+|---|---|---|
+| NVIDIA driver | 555+ | для CUDA 12 runtime |
+| CUDA Toolkit (для build patched FFmpeg) | 12.4+ | host или builder container |
+| GPU compute capability | ≥ 7.5 | требование CUDA IPC |
+| OS на target (Frigate runtime) | Debian 12 bookworm | glibc 2.36 — это база Frigate `stable-tensorrt` |
+| OS на builder | Debian 12 bookworm (glibc 2.36) | тот же glibc, что и в Frigate runtime (см. Dockerfile в шаге 1.1) |
+| docker buildx | latest | для multi-stage build |
+
+## Шаг 1 — Build patched Frigate image
+
+Cuframes integration требует patched FFmpeg внутри Frigate с `cuframes://`
+demuxer. Самый простой путь — собрать overlay image поверх existing Frigate.
+
+### 1.1. Минимальный Dockerfile (Debian 12 builder + custom FFmpeg)
+
+```dockerfile
+# Build patched FFmpeg на Debian 12 (glibc-совместимо с Frigate runtime)
+FROM debian:bookworm AS builder
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git nasm pkg-config ca-certificates wget patch ninja-build \
+    libssl-dev libx264-dev libx265-dev libnuma-dev zlib1g-dev \
+    libfreetype-dev libfribidi-dev libharfbuzz-dev libfontconfig-dev \
+    libvpx-dev libopus-dev libmp3lame-dev libvorbis-dev libtheora-dev libwebp-dev \
+    libaom-dev libdav1d-dev libsvtav1enc-dev \
+    libssh-dev librist-dev libsrt-openssl-dev \
+    libdrm-dev libva-dev libxcb1-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# CUDA toolkit 12.x
+RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb \
+ && dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb \
+ && apt-get update && apt-get install -y --no-install-recommends cuda-toolkit-12-6 \
+ && rm -rf /var/lib/apt/lists/*
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+# nv-codec-headers (для FFmpeg ffnvcodec/nvenc/nvdec)
+RUN git clone --depth 1 --branch n12.2.72.0 https://github.com/FFmpeg/nv-codec-headers.git /tmp/nvc \
+ && make -C /tmp/nvc install && rm -rf /tmp/nvc
+
+# Build libcuframes (shared library, install в /opt/cuframes)
+RUN git clone --depth 1 https://git.goldix.org/gx/cuframes.git /src/cuframes \
+ && cmake -B /src/cuframes/build -S /src/cuframes -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF \
+        -DBUILD_EXAMPLES=OFF -DBUILD_TOOLS=OFF \
+ && cmake --build /src/cuframes/build -j"$(nproc)" \
+ && cmake --install /src/cuframes/build --prefix /opt/cuframes
+
+# Clone patched FFmpeg fork (либо upstream + apply patch — см. filter/README.md)
+RUN git clone --depth 1 --branch n7.1-cuframes \
+        https://git.goldix.org/gx/ffmpeg-patched.git /src/ffmpeg
+
+# Configure (minimal-but-functional для Frigate)
+RUN cd /src/ffmpeg && ./configure \
+        --prefix=/opt/ffmpeg \
+        --enable-gpl --enable-version3 --enable-nonfree \
+        --enable-libcuframes \
+        --enable-libx264 --enable-libx265 \
+        --enable-libvpx --enable-libopus --enable-libmp3lame \
+        --enable-libvorbis --enable-libtheora --enable-libwebp \
+        --enable-libaom --enable-libdav1d --enable-libsvtav1 \
+        --enable-libfreetype --enable-libfribidi --enable-libharfbuzz \
+        --enable-libssh --enable-librist --enable-libsrt \
+        --enable-openssl \
+        --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec \
+        --extra-cflags="-I/opt/cuframes/include -I/usr/local/cuda/include" \
+        --extra-ldflags="-L/opt/cuframes/lib -L/usr/local/cuda/lib64" \
+        --extra-libs="-lcudart -lpthread -lrt -lm" \
+        --disable-doc --disable-htmlpages --disable-manpages
+RUN cd /src/ffmpeg && make -j"$(nproc)" && make install
+
+# ─── Runtime: Frigate + наши binaries поверх ──────────────────────────
+FROM ghcr.io/blakeblackshear/frigate:stable-tensorrt
+
+# Missing dynamic .so которые требует наш patched ffmpeg (Frigate image их не имеет —
+# bundled статически собран без них в DT_NEEDED)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libharfbuzz0b libfribidi0 librist4 libsrt1.5-openssl libssh-4 \
+        libvpx7 libwebpmux3 libwebp7 libdav1d6 libaom3 libmp3lame0 \
+        libsvtav1enc1 libtheora0 libvorbis0a libvorbisenc2 \
+        libx264-164 libx265-199 libopus0 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Replace bundled ffmpeg (оригинал backup'нем под .orig)
+RUN cp /usr/lib/ffmpeg/7.0/bin/ffmpeg  /usr/lib/ffmpeg/7.0/bin/ffmpeg.orig \
+ && cp /usr/lib/ffmpeg/7.0/bin/ffprobe /usr/lib/ffmpeg/7.0/bin/ffprobe.orig
+COPY --from=builder /opt/ffmpeg/bin/ffmpeg  /usr/lib/ffmpeg/7.0/bin/ffmpeg
+COPY --from=builder /opt/ffmpeg/bin/ffprobe /usr/lib/ffmpeg/7.0/bin/ffprobe
+COPY --from=builder /opt/cuframes/lib/libcuframes.so.0.1.0 /usr/local/lib/
+RUN cd /usr/local/lib && ln -sf libcuframes.so.0.1.0 libcuframes.so.0 \
+ && ln -sf libcuframes.so.0 libcuframes.so && ldconfig
+
+# Build-time smoke: ldd resolved + cuframes demuxer registered
+RUN ldd /usr/lib/ffmpeg/7.0/bin/ffmpeg | grep -q "not found" && exit 1 || true
+RUN /usr/lib/ffmpeg/7.0/bin/ffmpeg -hide_banner -formats | grep -q cuframes \
+ && echo "OK: cuframes demuxer registered in Frigate image"
+```
+
+Build:
+```bash
+docker build -t local/frigate-cuframes:latest -f Dockerfile.frigate .
+```
+
+Размер ~10 GB (наследует Frigate `stable-tensorrt` ~9 GB).
+
+## Шаг 2 — docker-compose: publisher + Frigate
+
+```yaml
+services:
+  # Один publisher на камеру — единственный source RTSP, делает 1× NVDEC.
+  cuframes-pub-parking:
+    image: git.goldix.org/gx/cuframes:0.1   # либо local build из filter/Dockerfile.runtime
+    container_name: cuframes-pub-parking
+    restart: unless-stopped
+    runtime: nvidia
+    # CRITICAL: ipc=shareable — Frigate и другие consumers подсоединяются через
+    # ipc: container:cuframes-pub-parking
+    ipc: shareable
+    shm_size: 256m
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    volumes:
+      - cuframes_sock:/run/cuframes
+    command:
+      - /usr/local/bin/cuframes-rtsp-source
+      - --rtsp
+      - "rtsp://admin:${CAM_PASS}@cam-parking-ip:554/cam/realmonitor?channel=1&subtype=1"
+      - --key
+      - cam-parking
+      - --ring
+      - "6"
+      - --verbose
+
+  frigate:
+    image: local/frigate-cuframes:latest
+    container_name: frigate
+    restart: unless-stopped
+    depends_on:
+      cuframes-pub-parking:
+        condition: service_started
+    runtime: nvidia
+    privileged: true
+    shm_size: 512m
+    # CUDA IPC c publisher'ом: shared /dev/shm
+    # WARN: pid намерено НЕ share'ится — Frigate использует s6-overlay,
+    # которое требует PID 1 в своём namespace.
+    ipc: "container:cuframes-pub-parking"
+    environment:
+      FRIGATE_RTSP_PASSWORD: "${FRIGATE_RTSP_PASSWORD}"
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    ports:
+      - "5000:5000"
+      - "8971:8971"
+    volumes:
+      - cuframes_sock:/run/cuframes:ro
+      - ./config/config.yml:/config/config.yml:ro
+      - /home/user/frigate-media:/media/frigate
+      # ... остальные volumes как обычно
+
+volumes:
+  cuframes_sock:
+```
+
+## Шаг 3 — Frigate config.yml
+
+Ключевые отличия от стандартного config:
+
+```yaml
+ffmpeg:
+  # ВАЖНО: hwaccel cuda отключаем (наш ffmpeg без cuda-llvm → нет scale_cuda).
+  # Detect-path использует CPU scale, но decode уже done у publisher'а.
+  hwaccel_args: []
+  output_args:
+    record: preset-record-generic-audio-aac
+
+cameras:
+  parking_overview:
+    enabled: true
+    ffmpeg:
+      inputs:
+        # main (full-res) — только запись в архив через прямой RTSP
+        # (decode у Frigate НЕ происходит — это `-c:v copy` mux)
+        - path: rtsp://admin:${FRIGATE_RTSP_PASSWORD}@cam-parking-ip:554/cam/realmonitor?channel=1&subtype=0
+          roles: [record]
+
+        # sub-stream → через cuframes (decoded у publisher'а, без второго NVDEC у Frigate)
+        - path: cuframes://cam-parking
+          input_args: -f cuframes
+          roles: [detect]
+    detect:
+      width: 640
+      height: 480
+      fps: 5
+```
+
+После v0.2 cuframes (encoded packet sharing) record-path тоже может
+переключиться на `cuframes_packets://cam-parking` — тогда **никакого RTSP в
+Frigate config'е вообще**.
+
+## Шаг 4 — Run + verify
+
+```bash
+docker compose up -d
+docker logs -f frigate
+```
+
+Что искать в logs:
+- `[INFO] Camera processor started for parking_overview` — normal startup
+- НЕТ `[ERROR] Ffmpeg process crashed` — если есть, посмотри
+  [Troubleshooting](#troubleshooting)
+- В `nvidia-smi dmon -s u` колонка `%dec` должна показывать ~1-2% на одну
+  камеру (это publisher), Frigate сам не decode'ит cuframes input
+
+```bash
+# Проверить что Frigate реально читает cuframes:
+docker exec frigate ps -ef | grep ffmpeg | grep cuframes
+# Должна быть линия вида:
+#   ffmpeg ... -f cuframes -i cuframes://cam-parking -r 5 -vf fps=5,scale=640:480 ...
+```
+
+## Troubleshooting
+
+### `s6-overlay-suexec: fatal: can only run as pid 1`
+
+Появляется если попытались добавить `pid: container:cuframes-pub-parking` в
+Frigate service. Frigate's s6-overlay strict требует PID 1.
+
+**Fix**: убрать `pid:` из compose. Если shared только `ipc:`, в большинстве
+случаев всё работает (Frigate подсоединяется первым, и его CUDA context служит
+для последующих).
+
+**Альтернатива**: запустить Frigate с собственным namespace но дублировать
+publisher socket через bind-mount. Frigate сам управляется first CUDA context.
+
+### `[AVFilterGraph] No such filter: 'scale_cuda'`
+
+Frigate config имеет `hwaccel_args: preset-nvidia` (default). Наш patched
+ffmpeg собран без `--enable-cuda-llvm` (не работает на glibc < 2.38). Эта
+опция компилирует CUDA filters, включая `scale_cuda`.
+
+**Fix**: `hwaccel_args: []` в config.yml. CPU scale (5-10% CPU per FHD25 камера).
+
+**Real fix** (planned): cuframes v0.2 — publisher сам делает resize до detect-size
+и публикует pre-scaled frames. Тогда Frigate не нуждается в scale_cuda.
+
+### `cudaIpcOpenEventHandle: invalid device context`
+
+Consumer container не имеет shared pid namespace с publisher'ом → CUDA driver
+не валидирует IPC peer.
+
+**Fix для cross-container CUDA IPC**: `pid: container:<publisher>` + `ipc:
+container:<publisher>`. Для Frigate этот fix недоступен (см. предыдущий пункт).
+Workaround — поднять Frigate первым после publisher (race window) или использовать
+encoded packet path (v0.2).
+
+### `Nonmatching transport in server reply` от RTSP-output Frigate
+
+Не относится к cuframes — это нормальное поведение Frigate's go2rtc для
+TCP transport. TV/VLC обычно использует UDP — оно работает.
+
+## v0.2: dual-input (detect + record через один RTSP)
+
+После cuframes v0.2 publisher активирует **encoded packet ring** параллельно
+с decoded frames ring. Это даёт Frigate одновременно:
+
+- `cuframes://<key>` — **decoded NV12** для `detect` role (как в v0.1)
+- `cuframes_packets://<key>` — **encoded H.264/H.265** для `record` role
+  (passthrough, без decode)
+
+→ **1 RTSP connection** к камере вместо 2-3 (Frigate сейчас открывает
+отдельный stream для record).
+
+### Setup
+
+```bash
+cuframes-rtsp-source \
+  --rtsp rtsp://admin:pw@192.168.88.98/cam/realmonitor?channel=1 \
+  --key cam-parking \
+  --enable-packet-ring
+```
+
+Publisher держит **два** SHM:
+- `/dev/shm/cuframes-cam-parking`         (decoded NV12, v0.1)
+- `/dev/shm/cuframes-cam-parking-packets` (encoded packets, v0.2)
+
+### Frigate config
+
+```yaml
+cameras:
+  cam_parking:
+    ffmpeg:
+      inputs:
+        - path: cuframes://cam-parking
+          input_args: -f cuframes
+          roles: [detect]
+        - path: cuframes_packets://cam-parking
+          input_args: -f cuframes_packets
+          roles: [record]
+```
+
+### Requirements
+
+- Patched FFmpeg с обоими demuxer'ами:
+  [gx/ffmpeg-patched PR #1](https://git.goldix.org/gx/ffmpeg-patched/pulls/1).
+- Frigate image пересобран с этим ffmpeg (см. «Шаг 1 — Build patched Frigate
+  image» выше, образ `local/frigate-cuframes:latest`).
+
+### Trade-offs
+
+| Метрика | v0.1 (frames only) | v0.2 (frames + packets) |
+|---|---|---|
+| RTSP к камере | 1 (publisher) | 1 (publisher) |
+| Frigate-side RTSP | 1+ (record отдельно) | **0** — всё через cuframes |
+| Camera RTSP streams | 2+ | **1** |
+| Доп. VRAM | ring (~10 MB) | без изменений |
+| Доп. host RAM | минимум | + 8 MB на packet ring |
+| Доп. CPU | nominal | nominal (memcpy в shared ring) |
+
+## См. также
+
+- [filter/README.md](../../filter/README.md) — детали FFmpeg demuxer + patch
+- [docs/integration.md](../integration.md) — общий integration guide
+- [docs/protocol.md §10](../protocol.md#10-v02-extension-encoded-packet-ring-proto_version2) — wire-protocol spec для packet ring
+- [BENCHMARKS.md](../../BENCHMARKS.md) — production-measured результаты
+- [ROADMAP.md](../../ROADMAP.md) — v0.3+ planned features
@@ -0,0 +1,47 @@
+# Launch drafts
+
+Drafts для outreach / launch. Все — **draft material**, перед отправкой review.
+
+## Порядок (рекомендуемый)
+
+1. **`frigate-integration-issue.md`** — soft-launch, низкий риск отказа, целевая
+   аудитория уже жалуется на проблему в 3 discussion'ах. Может дать первых
+   early-adopter'ов и social proof для следующего шага.
+2. **`ffmpeg-devel-rfc.md`** — после того как Frigate-discussion получит
+   позитивный engagement (даже один "+1, would use" комментарий — уже traction).
+   Mailing-list FFmpeg-devel предъявляет высокий стандарт; готовиться тщательно.
+3. **`hn-show-post.md`** — финальный, после того как либо RFC получит первый
+   response, либо ясно что молчат. HN — это amplifier, не starting line.
+
+## Что в каждом draft
+
+| Файл | Куда | Формат | Когда |
+|---|---|---|---|
+| [`frigate-integration-issue.md`](frigate-integration-issue.md) | github.com/blakeblackshear/frigate | Discussion (Ideas category) | Сейчас |
+| [`ffmpeg-devel-rfc.md`](ffmpeg-devel-rfc.md) | `ffmpeg-devel@ffmpeg.org` | Patch + cover letter via `git send-email` | После Frigate engagement |
+| [`hn-show-post.md`](hn-show-post.md) | news.ycombinator.com | Show HN | Etap F (finale) |
+
+## Что **не** делать
+
+- Не публиковать всё сразу в один день — невозможно отвечать на all-channels параллельно.
+- Не публиковать в выходные / праздники / во время большого tech-event (Apple keynote, GTC, etc).
+- Не упоминать "AI", "battle-tested", "production-ready", "enterprise" в тексте — все эти аудитории (FFmpeg-devel, Frigate, HN) аллергичны к маркетинговому языку.
+- Не публиковать FFmpeg patch **без** sign-off — automatic rejection.
+- Не отправлять HN-пост если не можешь быть онлайн первые 2 часа после публикации — ранжирование умрёт.
+
+## Что подготовить перед отправкой
+
+- [ ] Subscribe на ffmpeg-devel (https://ffmpeg.org/mailman/listinfo/ffmpeg-devel) — иначе reply'ы не получишь
+- [ ] `git config --global` для send-email (см. ffmpeg-devel-rfc.md шаги)
+- [ ] Sign-off в FFmpeg commit (`git commit --amend -s` если ещё нет)
+- [ ] GitHub аккаунт для Frigate discussion (если нет уже)
+- [ ] HN аккаунт с парой дней истории — fresh accounts автоматически шадо-банятся
+
+## После отправки
+
+Следить за reply'ями в течение первой недели. Все три канала — асинхронные, но первые **48 часов** обычно решающие.
+
+Куда смотреть статус engagement:
+- ffmpeg-devel: https://ffmpeg.org/pipermail/ffmpeg-devel/
+- Frigate discussion: появится в правой панели repo
+- HN: https://news.ycombinator.com/threads?id=YOURUSER
@@ -0,0 +1,160 @@
+# FFmpeg-devel RFC submission
+
+**Status:** DRAFT — review перед отправкой.
+
+**Куда:** `ffmpeg-devel@ffmpeg.org` (subscribe: https://ffmpeg.org/mailman/listinfo/ffmpeg-devel)
+
+**Как:** patch генерится через `git format-patch`, отправляется `git send-email` с cover-letter. FFmpeg **не использует** GitHub PR / pull-request — только mailing-list patches.
+
+---
+
+## Шаги отправки
+
+```bash
+# 1. Конфигурация git send-email (один раз)
+git config --global sendemail.smtpserver smtp.gmail.com
+git config --global sendemail.smtpserverport 587
+git config --global sendemail.smtpencryption tls
+git config --global sendemail.smtpuser ВАШ-EMAIL
+# password — через ~/.netrc или интерактивно
+
+# 2. На fork ffmpeg-patched, в ветке n7.1-cuframes:
+cd /path/to/ffmpeg-patched
+git log --oneline n7.1..n7.1-cuframes  # должен быть один commit
+
+# 3. Подготовить .patch
+git format-patch -1 --cover-letter --subject-prefix='RFC PATCH' \
+    --output-directory=/tmp/cuframes-rfc \
+    n7.1..n7.1-cuframes
+
+# 4. Отредактировать /tmp/cuframes-rfc/0000-cover-letter.patch:
+#    - Заменить *** SUBJECT HERE *** → см. ниже
+#    - Заменить *** BLURB HERE *** → cover-letter body (см. ниже)
+
+# 5. Dry-run
+git send-email --dry-run --to=ffmpeg-devel@ffmpeg.org /tmp/cuframes-rfc/*.patch
+
+# 6. Реальная отправка
+git send-email --to=ffmpeg-devel@ffmpeg.org /tmp/cuframes-rfc/*.patch
+```
+
+## Subject line
+
+```
+[RFC PATCH 0/1] libavformat/cuframesdec: zero-copy CUDA frame ingest via IPC
+```
+
+## Cover-letter body
+
+```
+Hi all,
+
+This RFC adds a new demuxer "cuframes" to libavformat that ingests already-
+decoded video frames residing in CUDA device memory, produced by another
+process via the libcuframes IPC layer [1].
+
+# Why
+
+In multi-consumer GPU video pipelines (CCTV with multiple analytics
+services, multi-stream transcoding farms, ML inference + recording on the
+same source) every consumer typically runs its own NVDEC session. On 16
+cameras × 25 fps × N consumers this multiplies NVDEC sessions, OS
+context-switches and host<->device PCIe traffic for what is logically the
+same decoded frame.
+
+cuframes addresses this by letting one process decode (e.g. via FFmpeg's
+existing CUDA hwaccel) and publish the decoded frames into a small CUDA
+ring buffer; other processes import the buffer via cudaIpcOpenMemHandle
+and consume the same VRAM allocation without redecoding or copying.
+
+The libavformat demuxer in this RFC is the consumer side: it exposes the
+remote ring buffer as a regular AVFormat input source, so any downstream
+FFmpeg filter chain or muxer can use it transparently.
+
+# Scope of this patch
+
+  libavformat/cuframesdec.c   — new demuxer
+  libavformat/allformats.c    — registration
+  configure                   — --enable-libcuframes option
+
+The demuxer currently outputs NV12 frames via cudaMemcpy2DAsync to host
+memory (rawvideo path). A v0.2 follow-up is planned that emits frames
+directly as CUDA AVHWFramesContext (true zero-copy into a CUDA-aware
+filter chain) — see [2].
+
+# Out-of-tree library
+
+libcuframes (the producer side, the IPC handshake, the ring-buffer
+allocator) lives out-of-tree at [1], licensed LGPL-2.1+ to match FFmpeg.
+The demuxer links against libcuframes via pkg-config.
+
+This mirrors the model used by other libavformat plugins that wrap third-
+party libraries (libsmbclient, librist, libsrt, etc.).
+
+# Testing
+
+- Unit smoke tests in the libcuframes repo (1 publisher × 4 subscribers ×
+  2000 frames @ 120 fps — 0 torn frames, 0 gaps).
+- E2E test against a real RTSP IP camera (Dahua HEVC 1920×1080, 25 fps,
+  100/100 frames, avg_fps=25.03).
+- ~24h production deployment serving Frigate (object detection) and a
+  custom analytics pipeline from a single decoder, single NVDEC session.
+
+# Prior art and what this is not
+
+There is no in-tree mechanism for sharing decoded GPU frames between
+unrelated FFmpeg processes. Existing alternatives are:
+  - CUDA hwdownload + hwupload (defeats the purpose — round-trips via PCIe)
+  - DeepStream Gst-nvstreammux (NVIDIA, closed, GStreamer-only)
+  - Vendor-locked NVENC/NVDEC pooling helpers
+
+cuframes is intentionally minimal: ring buffer + handshake + IPC handles.
+No transcoding logic, no policy.
+
+# Limitations / known issues for review
+
+  - NVIDIA GPUs only (CUDA IPC is vendor-specific).
+  - Linux only (POSIX SHM + AF_UNIX sockets).
+  - Producer and consumer must share the same CUDA device (CUDA IPC limit).
+  - NV12 only in v0.1; other pixel formats are roadmap items.
+  - Driver ≥ 525, CUDA toolkit ≥ 12.0 (≥ 13.0 recommended).
+
+# Feedback wanted
+
+  1. Is the libavformat demuxer the right home for this, or would a
+     hwcontext_cuda extension + a thin demuxer be a better split?
+  2. Are folks open to an out-of-tree library dependency under
+     --enable-libcuframes, given the precedent of librist/libsrt?
+  3. Naming: "cuframes" vs "cudaipcframes" vs something else?
+
+Happy to iterate. Patch follows.
+
+[1] https://git.goldix.org/gx/cuframes  (LGPL-2.1+)
+[2] https://git.goldix.org/gx/cuframes/issues/2  (v0.2 zero-copy plan)
+
+Signed-off-by: <YOUR NAME> <YOUR EMAIL>
+```
+
+## Notes на review
+
+- **Subject prefix `[RFC PATCH]`** — потому что это design discussion, не "merge this now". Если получите конструктивный feedback и сделаете revision — следующая будет `[PATCH v2]`.
+- **Sign-off обязателен** — иначе patch отклонят на уровне tooling.
+- **Не упоминать** "production-ready", "battle-tested", "30 days of uptime" — FFmpeg-devel список **очень** аллергичен на маркетинговый тон. Numbers OK, эпитеты нет.
+- **Не CC** maintainers без приглашения — ответят те, кому интересно. Можно CC Timo Rothenpieler (CUDA hwaccel maintainer) если хочется ускорить — но **только** после первого revision если тишина.
+- Возможные возражения:
+  - "Why not Vulkan video?" — Vulkan video не имеет cross-process sharing API на уровне CUDA IPC. Vulkan external memory работает с DMA-BUF на Linux но требует DRM device sharing, что тоже non-trivial — отдельный RFC материал.
+  - "Why a new demuxer, not a filter?" — потому что producer уже **вне** этого FFmpeg-процесса; demuxer — это место где AVFormat читает из внешнего источника. Filter pull'ает из upstream AVStream — здесь нет upstream.
+
+## Альтернативный путь — ffmpeg-user (lighter)
+
+Если кажется что для `-devel` сразу с patch'ем тяжело — можно начать с **awareness email** в `ffmpeg-user@ffmpeg.org`:
+
+```
+Subject: ANNOUNCE: libcuframes — zero-copy CUDA frame sharing for FFmpeg pipelines
+
+[3 параграфа: what / why / link to repo]
+
+Patch для libavformat будет отправлен в -devel список после feedback от пользователей.
+```
+
+Это **soft launch** — меньше рисков отказа, больше шансов получить early adopters которые потом support'ят RFC. Рекомендую этот шаг **сначала**.
@@ -0,0 +1,115 @@
+# Frigate integration issue
+
+**Status:** DRAFT — review перед публикацией.
+
+**Куда:** https://github.com/blakeblackshear/frigate
+
+**Тип:** GitHub **Discussion** (category: Ideas), **не** Issue. Причина: это feature proposal, не баг. Frigate активно использует discussions (см. [#17033](https://github.com/blakeblackshear/frigate/discussions/17033), [#20191](https://github.com/blakeblackshear/frigate/discussions/20191), [#21559](https://github.com/blakeblackshear/frigate/discussions/21559) — все три уже жалуются на эту проблему).
+
+**Альтернатива:** ответить в одной из существующих discussion'ов о NVDEC saturation. Может быть лучше — там уже собралась audience.
+
+---
+
+## Title
+
+```
+[Ideas] Reduce NVDEC duplication on multi-consumer cameras via shared CUDA frame buffer (cuframes)
+```
+
+## Body
+
+```markdown
+## Problem
+
+When Frigate co-exists with other GPU-using video consumers on the same
+camera stream (separate AI processor, custom analytics, recording to a
+second NVR, etc.), each process opens its own NVDEC session and decodes
+the same H.264/HEVC stream independently. On 16+ cameras at 25 fps this
+becomes the bottleneck on consumer GPUs:
+
+- NVDEC sessions are limited (4 concurrent on RTX 30xx/40xx, more on
+  workstation cards). Decoder context creation / destruction is not free.
+- Each duplicate decode burns PCIe bandwidth pushing the same NV12 frame
+  to host memory (in setups that go through `hwdownload`).
+- Power draw and thermals scale with redundant decoding.
+
+Related discussions: #17033, #20191, #21559.
+
+## Existing workarounds
+
+- Single Frigate restream and have everything else pull from go2rtc — works
+  for re-encoding to TCP/UDP, but every downstream still re-decodes.
+- DeepStream `nvstreammux` — solves it but is closed-source NVIDIA stack,
+  GStreamer-only, not co-installable with current Frigate ffmpeg pipeline.
+
+## Proposal: cuframes ingest source
+
+[cuframes](https://git.goldix.org/gx/cuframes) (LGPL-2.1+) is a small
+library that lets one process decode once into a CUDA ring buffer and any
+number of other processes import that buffer via CUDA IPC and consume
+**zero-copy** in VRAM.
+
+Concretely for Frigate this would mean a new ffmpeg input source like:
+
+```yaml
+cameras:
+  driveway:
+    ffmpeg:
+      inputs:
+        - path: cuframes://driveway
+          input_args: preset-cuframes
+          roles: [detect]
+```
+
+where a sentinel container (one per camera, ~5MB RAM, runs
+`cuframes-rtsp-source`) does the actual RTSP pull + NVDEC and Frigate
+attaches to that pre-decoded stream.
+
+## Working integration (early proof)
+
+I've been running this in production for ~24h: a single
+`cuframes-rtsp-source` container per camera serves both Frigate
+(detection role) **and** a separate C++ analytics pipeline from the same
+NVDEC session. Frigate gets pre-decoded NV12 frames; no detection or
+recording behaviour was changed.
+
+Integration guide with full docker-compose and a patched Frigate Dockerfile:
+https://git.goldix.org/gx/cuframes/src/branch/main/docs/integrations/frigate.md
+
+## What I'm asking for
+
+Not a PR yet — first I'd like maintainer / community input on:
+
+1. Would Frigate be open to **upstream** a `cuframes://` input source, or
+   should this stay a third-party patched Frigate image?
+2. If upstream — what's the preferred shape: new ffmpeg preset only
+   (zero core code changes), or a first-class `decoder: cuframes` option
+   in the Frigate config schema?
+3. The cuframes library currently requires `--ipc` and `--pid` namespace
+   sharing between producer and consumer containers. Frigate uses
+   `s6-overlay` which is incompatible with `--pid` share (s6 needs PID 1).
+   The current integration uses a small race-window workaround
+   ([troubleshooting #2](https://git.goldix.org/gx/cuframes/src/branch/main/docs/troubleshooting.md));
+   a cleaner solution requires either making s6 optional in the Frigate
+   image or moving the IPC handshake to a sidecar pattern.
+
+## Limitations of cuframes (full disclosure)
+
+- NVIDIA GPUs only.
+- Linux only.
+- Producer + consumer must share the same CUDA device.
+- NV12 frame format only in v0.1.
+- Requires patching FFmpeg with a small (~400 LOC) demuxer; an upstream
+  FFmpeg RFC is in flight separately.
+
+If this looks worth pursuing I'm happy to open a draft PR against a feature
+branch and iterate.
+```
+
+## Notes на review
+
+- **Tone:** Frigate maintainer (Blake) ценит конкретику и production proof — без них любой feature request кладётся в backlog. У нас есть production proof (24h+) — это сильный аргумент, использован прямо.
+- **Не обещаем upstream без request'а** — спрашиваем discussion'ом, не PR'ом. Если Blake скажет "не наш scope, оставайтесь third-party" — это OK; integration guide уже валиден как standalone.
+- **Прозрачно про s6-overlay constraint** — это блокирующий issue для clean upstream'а. Лучше упомянуть сразу чем спрятать и получить отказ через 2 недели review.
+- **Линки на 3 existing discussions** — показывает что problem подтверждена сообществом, не наша одинокая боль.
+- **Не упоминать другие AI-системы** (ANPR, face recognition итд) — Blake уже несколько раз говорил что Frigate scope = детектор и NVR, не platform. Подача "cuframes решает вашу проблему" работает лучше чем "cuframes построит экосистему".
@@ -0,0 +1,107 @@
+# Show HN post (для Etap F — позже)
+
+**Status:** DRAFT — не публикуем сейчас. Этот файл черновик к Etap F (launch).
+
+**Куда:** https://news.ycombinator.com/submit
+
+**Когда публиковать:**
+- После того как FFmpeg-devel RFC получит первый response (даже отказ — это traction)
+- ИЛИ после того как Frigate discussion получит +5 upvotes / 3+ комментариев
+- ИЛИ если оба молчат 2 недели — публиковать в любом случае, HN-аудитория более независимая
+- **Время:** будний день, 13:00-15:00 UTC (peak HN traffic from US morning + EU afternoon)
+- **Не публиковать** в пятницу вечером / в выходные / в крупный tech-event день (Apple keynote, GTC, etc.) — drown'ит в шуме
+
+---
+
+## Title
+
+Опции (выбрать одну):
+
+1. `Show HN: Cuframes – zero-copy sharing of decoded video frames between processes via CUDA IPC`
+2. `Show HN: Stop redecoding the same RTSP stream in every consumer`
+3. `Show HN: Cuframes – one NVDEC, many consumers, zero-copy in VRAM`
+
+Рекомендую **#2** — describes problem in 9 words, HN любит problem-first titles. #1 — для технической HN ниши тоже OK.
+
+## Body
+
+```markdown
+Hi HN,
+
+I run a homelab CCTV stack with 16 cameras feeding into Frigate (object
+detection), a custom C++ analytics service, and a recording NVR. All three
+were running NVDEC on the same RTSP streams. On an RTX 3060 this saturated
+the decoder slots and the consumer GPUs in my office burnt about 40W of
+redundant decoding when nothing interesting was happening.
+
+So I wrote a small library that lets one process decode the stream once
+into a CUDA ring buffer and the others import the same buffer via
+cudaIpcOpenMemHandle. Decoded NV12 frame lands in VRAM exactly once, every
+consumer reads it zero-copy.
+
+Repo (LGPL-2.1+): https://git.goldix.org/gx/cuframes
+
+What's in it:
+
+  - libcuframes — the producer/consumer C/C++ library
+  - cuframes-rtsp-source — standalone RTSP → cuframes bridge (one per cam)
+  - A small out-of-tree FFmpeg demuxer ("cuframes://") so downstream
+    consumers don't need to know they're consuming shared frames
+  - Reference docker-compose for the Frigate + custom-app setup
+  - 24h production deployment on the homelab, ~25 fps × 16 cameras × 3
+    consumers from a single NVDEC session
+
+What surprised me along the way:
+
+  - CUDA IPC handles are bound to the device that allocated them, not just
+    a CUDA context — both peers must be on the same GPU. (Documented;
+    bit out of the way in the Programming Guide §3.2.8.)
+  - Cross-container CUDA IPC needs both --ipc and --pid namespace share,
+    not just --ipc. The --pid requirement wasn't obvious from the error
+    message ("invalid device context" with no mention of /proc visibility).
+  - Frigate's s6-overlay is incompatible with --pid share because s6
+    insists on being PID 1. There's a documented race-window workaround
+    but it's the one rough edge.
+
+What it is not:
+
+  - Not a transcoding framework. No re-encoding, no filtering, no policy.
+  - Not multi-GPU (CUDA IPC is single-device).
+  - Not Windows / macOS / WSL2 / AMD.
+
+What's next:
+
+  - Upstream FFmpeg RFC for the demuxer (drafted, not sent yet — would
+    appreciate review of the RFC text first).
+  - v0.2 makes the FFmpeg path true zero-copy via AVHWFramesContext (no
+    cudaMemcpy2DAsync round-trip).
+
+Happy to answer questions. Especially interested in:
+
+  - Anyone running multi-consumer GPU video pipelines with a different
+    solution? Curious what tradeoffs you hit.
+  - Vulkan-video folks: is there an obvious cross-process sharing path
+    via VkExternalMemory + DMA-BUF that I'm missing? I went CUDA-only
+    because that's what worked first, but Vulkan would be vendor-neutral.
+
+— [your handle]
+```
+
+## Notes на review
+
+- **HN формат:** первая строка — hook (concrete problem, concrete numbers — "40W redundant decoding"). НЕ начинать с "Hi everyone, today I'm excited to share..."
+- **Без emoji**, без markdown headers (HN не renders'ит markdown в title-area; body тоже почти plain text)
+- **Конкретные числа** — HN respect'ит numbers. "40W", "24h", "25 fps × 16 cam × 3 consumer", "~400 LOC patch"
+- **"What it is not"** — отсекает Vue Apologists которые иначе пишут "why don't you support Windows?". Это HN best practice
+- **Open questions внизу** — driver discussion. Без них первый комментарий = "и зачем это?". С ними — "вот мой опыт с DeepStream"
+- **Avoid:** "battle-tested", "production-ready", "enterprise-grade", "10x faster than X" — HN crowd специально downvotes такое
+- **Будь готов** отвечать **первые 2 часа** активно — HN ранжирование сильно зависит от engagement в первый час. Если не сможешь быть онлайн — не публикуй
+- **Если автор — не main maintainer** repo — упомянуть это в первом комменте от собственного аккаунта чтобы не выглядело как третье-лицо PR
+
+## Альтернатива — r/selfhosted
+
+Если HN кажется слишком high-stakes, можно сначала **r/selfhosted** (180k subs) — там Frigate-аудитория, прямой fit. Менее brutal, легче получить early feedback.
+
+Title для reddit: `Reduced NVDEC saturation across Frigate + custom apps by sharing decoded frames over CUDA IPC — open-sourced the library`
+
+Этот текст короче (HN body слишком длинный для reddit), но идея та же.
@@ -423,3 +423,400 @@ TEST(Handshake, HelloRespMismatchProto) {
 `libcuframes/src/protocol.c` (Phase 1, Step 2) — единственная reference.
 Любая другая реализация (Python ctypes, Rust bindings, FFmpeg plugin)
 должна **conformance-tested** против этого документа.
+
+## 10. v0.2 extension: encoded packet ring (proto_version=2)
+
+**Статус:** design draft, ещё не реализовано (см. issue #2).
+
+Параллельно с decoded-frames ring (§2) publisher может опционально
+поддерживать **encoded packet ring** — публикует raw H.264/H.265 NAL units
+**до** decoder, для consumer'ов которые делают `-c:v copy` (recording, mux).
+
+### 10.1 Совместимость с v1
+
+- v2 publisher принимает **v1-subscribers** — они получают только frames
+  ring (как v0.1), packet ring им не показывается.
+- v1 publisher отвергает v2-subscribers с `wants_packets=true`
+  (HELLO_RESP error PROTOCOL).
+- v1 layout (§2) **не меняется** для frames ring — packet ring это отдельный SHM.
+
+Publisher version bumping:
+- `proto_version` = 2 в SHM header и в HELLO_RESP когда packet ring active.
+- Если publisher v2 не активирует packet ring (`enable_packet_ring=false`)
+  — `proto_version` остаётся 1 (полная v1 compat).
+
+### 10.2 Дополнительные ресурсы
+
+| Resource | Path | Назначение | Когда |
+|---|---|---|---|
+| Packet shared memory | `/dev/shm/cuframes-<key>-packets` | Packet ring header + slots + byte buffer | если publisher активировал packet ring |
+
+Cleanup — симметрично §1: `shm_unlink` при destroy(); orphaned автоматически
+если nobody mmap'ит.
+
+### 10.3 Packet ring layout
+
+Размер пакетного SHM: `sizeof(packet_ring_header_t) + N×PSE + DATA_SIZE`,
+где:
+- N = `packet_ring_slots`, default 64 (configurable)
+- PSE = `sizeof(packet_slot_entry_t)` = 64 байт (см. §10.5)
+- DATA_SIZE = `packet_data_size`, default 8 MB (configurable)
+
+#### Byte layout
+
+```
+Offset                  Size   Field                       Comments
+─────────────────────── ────── ──────────────────────────  ─────────────────────────────
+0x0000                       4 magic (LE u32)              0xCC7C1DCD  (frames magic + 1)
+0x0004                       4 proto_version (LE u32)      2
+0x0008                       4 ring_slots (LE u32)         N (1..1024)
+0x000C                       4 data_size (LE u32)          bytes for packet data ring
+0x0010                       4 codec_id (LE u32)           AV_CODEC_ID_* enum (см. §10.4)
+0x0014                       4 codec_extradata_size (LE u32)  ≤ 4096
+0x0018                       8 producer_pid (LE u64)
+0x0020                       8 global_seq (LE u64, atomic) монотонная по packets
+0x0028                       8 last_keyframe_seq (LE u64, atomic)  для late subscribers
+0x0030                       8 write_offset (LE u64, atomic) текущий cursor в data ring
+0x0038                       8 shutdown_flag (LE u64, atomic)
+0x0040                    4096 codec_extradata             SPS/PPS/VPS bytes (см. §10.4)
+0x1040                    N×64 slots[N]                    packet_slot_entry_t (см. §10.5)
+0x1040+N×64           DATA_SIZE data[]                     wraparound byte buffer
+```
+
+Все atomic fields — C11 `_Atomic` (release/acquire semantics для seq updates).
+
+### 10.4 Codec extradata
+
+H.264 — SPS + PPS, конкатенированные в **Annex B** формате
+(start codes `00 00 00 01`). H.265 — VPS + SPS + PPS.
+
+`codec_id` соответствует FFmpeg `AV_CODEC_ID_H264`, `AV_CODEC_ID_HEVC`,
+`AV_CODEC_ID_AV1` (future). Subscriber пишет этот extradata в
+`AVCodecContext.extradata` своего decoder'а (если он его создаёт)
+или в `AVStream.codecpar->extradata` для muxer'ов.
+
+Extradata устанавливается publisher'ом **один раз** при первом keyframe
+(или из RTSP SDP до первого packet). После — fixed на lifetime publisher'а
+(codec change mid-stream → publisher destroy+recreate с новым `<key>`).
+
+### 10.5 Packet slot entry (64 байта)
+
+```
+Offset  Size   Field                       Comments
+0x00      8    seq (LE u64, atomic)        published seq; UINT64_MAX = invalid
+0x08      8    pts_ns (LE i64)
+0x10      8    dts_ns (LE i64)             для B-frames pipelines
+0x18      8    data_offset (LE u64)        offset в `data[]` секции SHM
+0x20      4    data_size (LE u32)          size of payload bytes
+0x24      4    flags (LE u32)              §10.6
+0x28     24    reserved                    0
+```
+
+`data_offset` может быть **больше** размера data-секции SHM (`hdr->data_size`) —
+semantics "absolute byte cursor", фактический byte index =
+`data_offset % hdr->data_size` (не путать с полем `data_size` слота — это размер payload).
+Subscriber может detect wrap (если payload crosses end → split read).
+
+### 10.6 Packet flags
+
+```
+Bit  Name              Comments
+0    KEY               keyframe (IDR for H.264, или CRA/IDR для HEVC).
+                       Critical для late subscribers — must wait IDR.
+1    CORRUPT           publisher detect'нул что packet damaged
+                       (RTP loss и т.п.) — subscriber может skip
+2    DISCONTINUITY     был gap перед этим packet
+                       (publisher reconnect к камере)
+3    LAST_IN_AU        last NAL в access unit (полный frame)
+                       — для muxer'ов которые ждут полный frame
+4-31 reserved          0
+```
+
+Mapping в `AVPacket.flags`:
+- bit 0 (KEY) → `AV_PKT_FLAG_KEY`
+- bit 1 (CORRUPT) → `AV_PKT_FLAG_CORRUPT`
+- bit 2 (DISCONTINUITY) → `AV_PKT_FLAG_DISCONTINUITY` (FFmpeg 5+)
+
+### 10.7 Atomic publish (publisher-side)
+
+```c
+// Pseudo-C (упрощено, без error handling)
+uint64_t seq = atomic_load(&hdr->global_seq, RELAXED) + 1;
+uint64_t off = atomic_load(&hdr->write_offset, RELAXED);
+
+// 1. Найти free slot (overwrite oldest)
+size_t slot_idx = seq % hdr->ring_slots;
+packet_slot_entry_t *slot = &slots[slot_idx];
+
+// 2. Записать payload bytes (wraparound, может потребовать 2 memcpy)
+size_t off_in_ring = off % hdr->data_size;
+size_t first_chunk = min(size, hdr->data_size - off_in_ring);
+memcpy(data + off_in_ring, payload, first_chunk);
+if (first_chunk < size)
+    memcpy(data, payload + first_chunk, size - first_chunk);
+
+// 3. RELEASE: записать metadata в slot
+slot->pts_ns = pts;
+slot->dts_ns = dts;
+slot->data_offset = off;
+slot->data_size = size;
+slot->flags = flags;
+atomic_store(&slot->seq, seq, RELEASE);
+
+// 4. Update global cursor + global_seq
+atomic_store(&hdr->write_offset, off + size, RELEASE);
+atomic_store(&hdr->global_seq, seq, RELEASE);
+
+// 5. If KEY → update last_keyframe_seq
+if (flags & PKT_FLAG_KEY)
+    atomic_store(&hdr->last_keyframe_seq, seq, RELEASE);
+```
+
+### 10.8 Atomic read (subscriber-side)
+
+```c
+// Pseudo-C
+uint64_t cur = atomic_load(&hdr->global_seq, ACQUIRE);
+if (cur <= my_last_seq)  return TIMEOUT;  // ничего нового
+
+uint64_t want_seq = my_last_seq + 1;
+size_t slot_idx = want_seq % hdr->ring_slots;
+packet_slot_entry_t *slot = &slots[slot_idx];
+
+uint64_t slot_seq = atomic_load(&slot->seq, ACQUIRE);
+if (slot_seq != want_seq) {
+    // overrun — slow subscriber. Re-anchor:
+    want_seq = atomic_load(&hdr->last_keyframe_seq, ACQUIRE);
+    slot_idx = want_seq % hdr->ring_slots;
+    slot = &slots[slot_idx];
+    return DROPPED;  // signal user через flags = DISCONTINUITY
+}
+
+// Copy payload (wraparound aware)
+uint64_t off = slot->data_offset % hdr->data_size;
+uint32_t size = slot->data_size;
+uint32_t first_chunk = min(size, hdr->data_size - off);
+memcpy(out_buf, data + off, first_chunk);
+if (first_chunk < size)
+    memcpy(out_buf + first_chunk, data, size - first_chunk);
+
+// Re-check slot->seq не изменился (защита от overrun mid-read)
+if (atomic_load(&slot->seq, ACQUIRE) != want_seq) {
+    return DROPPED;  // publisher overwrote во время copy
+}
+
+my_last_seq = want_seq;
+return OK;
+```
+
+Защита от overrun mid-read через **post-check `slot->seq`** — простой
+вариант seqlock. Если publisher успел overwrite между metadata-read и
+data-copy — subscriber detect'ит это и возвращает `DROPPED`.
+
+### 10.9 Socket protocol extensions
+
+#### HELLO_REQ — добавляются flags в reserved field
+
+v1 layout (§3.3):
+```
+[4 bytes]  proto_version
+[4 bytes]  consumer_name_len
+[N bytes]  consumer_name
+[4 bytes]  cuda_device
+[4 bytes]  mode
+[12 bytes] reserved (must be 0)  ← v0.2 использует первые 4 байта
+```
+
+v0.2 интерпретирует первые 4 байта `reserved` как `subscribe_flags`:
+
+| Bit | Name | Comments |
+|---|---|---|
+| 0 | `WANTS_FRAMES` | подписаться на decoded frames ring (default ON в v1 — implicit) |
+| 1 | `WANTS_PACKETS` | подписаться на encoded packet ring |
+| 2-31 | reserved | 0 |
+
+Если v1-subscriber оставляет reserved=0 — publisher v2 интерпретирует это
+как `WANTS_FRAMES=true, WANTS_PACKETS=false` (v1 backward-compat).
+
+#### HELLO_RESP — добавляются packet-ring fields
+
+v1 layout (§3.4) расширяется в reserved секции:
+
+```
+[4 bytes]  result
+[4 bytes]  proto_version_actual         ← теперь может быть 1 или 2
+[4 bytes]  ring_size                    ← frames ring
+[4 bytes]  ownership_mode
+[64 bytes] frame_meta
+[4 bytes]  shm_path_len                 ← frames SHM
+[N bytes]  shm_path
+[12 bytes] reserved                     ← v0.2 интерпретирует
+```
+
+v0.2 reserved layout (если `proto_version_actual == 2` И publisher
+поддерживает packets):
+```
+[4 bytes]  packet_shm_path_len (LE u32) 0 = packets disabled at publisher
+[N bytes]  packet_shm_path (UTF-8)      — относительно /dev/shm/, например "cuframes-camA-packets"
+[4 bytes]  codec_id (LE u32)            AV_CODEC_ID_*
+[8 bytes]  initial_packet_seq (LE u64)  last_keyframe_seq на момент handshake
+                                        (subscriber должен start с этого seq)
+```
+
+Если subscriber запросил `WANTS_PACKETS=1` но publisher не имеет packet ring
+— `result = ERR_NOT_AVAILABLE`.
+
+### 10.10 Subscriber state machine extension
+
+Подключение к **обоим** rings (или одному из):
+
+```
+   ┌──────────┐
+   │ HELLO_OK │ proto_version_actual=2, packet_shm_path_len>0
+   └────┬─────┘
+        │
+        ▼
+   ┌────────────────────────────────┐
+   │ Open frames SHM (если WANTS_FRAMES) │ → standard v1 flow
+   └────────────────────────────────┘
+        │
+        ▼
+   ┌────────────────────────────────┐
+   │ Open packet SHM (если WANTS_PACKETS) │
+   │ - mmap /dev/shm/cuframes-<key>-packets │
+   │ - check magic, proto_version            │
+   │ - set my_last_packet_seq = initial_packet_seq - 1 │
+   │   (так что первый next_packet вернёт IDR)         │
+   └────────────────────────────────┘
+        │
+        ▼
+   ┌─────────┐
+   │ READY   │ — frames или packets или оба доступны
+   └─────────┘
+```
+
+### 10.11 Threading в subscriber
+
+Frames ring и packet ring имеют **разные** `global_seq` counters.
+Subscriber имеет **отдельные** `my_last_seq` для каждого. Может
+poll'ить обе независимо (или через два threads).
+
+Producer's `cudaEventRecord` (frames sync) не релевантен для packets —
+encoded data на CPU, без CUDA sync.
+
+### 10.12 Конфигурируемость packet ring
+
+Publisher API extension (§10.13) принимает параметры:
+
+```c
+typedef struct {
+    uint32_t packet_ring_slots;   // default 64
+    uint32_t packet_data_size;    // default 8 MB (8388608)
+    uint32_t max_packet_size;     // default 2 MB — sanity guard для oversized
+                                  // packets (publisher rejects with error)
+    uint32_t codec_id;            // AV_CODEC_ID_H264 / HEVC / ...
+} cuframes_packet_ring_options_t;
+```
+
+### 10.13 API extension (для cuframes.h)
+
+```c
+/* Create publisher с активным packet ring. NULL для opts → packet ring disabled. */
+int cuframes_publisher_create_ex(
+    const cuframes_publisher_options_t *frames_opts,
+    const cuframes_packet_ring_options_t *packet_opts, /* NULL = no packet ring */
+    cuframes_publisher_t **pub_out
+);
+
+/* Set codec extradata (SPS/PPS) — должен быть called до первого publish_packet. */
+int cuframes_publisher_set_codec_extradata(
+    cuframes_publisher_t *pub,
+    const void *extradata,
+    size_t size
+);
+
+/* Публикация packet. Slow consumer = overwrite oldest. */
+int cuframes_publisher_publish_packet(
+    cuframes_publisher_t *pub,
+    const void *data,
+    size_t size,
+    int64_t pts_ns,
+    int64_t dts_ns,
+    uint32_t flags   /* CUFRAMES_PKT_FLAG_KEY | _CORRUPT | _DISCONTINUITY | _LAST_IN_AU */
+);
+
+/* Subscriber-side: подписаться с opt-in для packets. */
+typedef struct {
+    /* ... existing v1 fields ... */
+    uint32_t subscribe_flags;  /* WANTS_FRAMES, WANTS_PACKETS bits */
+} cuframes_subscriber_options_v2_t;
+
+int cuframes_subscriber_create_v2(
+    const cuframes_subscriber_options_v2_t *opts,
+    cuframes_subscriber_t **sub_out
+);
+
+/* Чтение packet. Opaque handle — каллер вызывает release_packet после. */
+typedef struct cuframes_packet cuframes_packet_t;
+
+int cuframes_subscriber_next_packet(
+    cuframes_subscriber_t *sub,
+    cuframes_packet_t **pkt_out,
+    int32_t timeout_ms
+);
+
+const void * cuframes_packet_data(const cuframes_packet_t *p);
+size_t       cuframes_packet_size(const cuframes_packet_t *p);
+int64_t      cuframes_packet_pts(const cuframes_packet_t *p);
+int64_t      cuframes_packet_dts(const cuframes_packet_t *p);
+uint32_t     cuframes_packet_flags(const cuframes_packet_t *p);
+
+int cuframes_subscriber_release_packet(cuframes_subscriber_t *sub, cuframes_packet_t *p);
+
+/* Codec params для subscriber (extracted из shared header). */
+int cuframes_subscriber_get_codec_params(
+    cuframes_subscriber_t *sub,
+    uint32_t *codec_id_out,
+    const void **extradata_out,
+    size_t *extradata_size_out
+);
+```
+
+`cuframes_packet_t` opaque — фактически указатель в local-mapped data (на
+heap subscriber'а — copy при `next_packet`, освобождение при `release`).
+Subscriber **не** держит ссылки на shared ring data между `next_packet` и
+`release_packet` — это избавляет от reader-locks.
+
+### 10.14 Late subscriber → keyframe-aligned start
+
+При HELLO_RESP publisher отвечает `initial_packet_seq = last_keyframe_seq`.
+
+Subscriber устанавливает `my_last_seq = initial_packet_seq - 1`, так что
+первый `next_packet` вернёт keyframe (decoder может start без glitches).
+
+**Risk:** если в момент handshake **last_keyframe_seq уже выехал из
+ring** (slow start subscriber, GOP > ring_slots packets) — subscriber
+detect overrun в первом read и переходит на следующий keyframe.
+
+В implementation `publisher_publish_packet` для оптимизации может маркировать
+slot перед IDR как **persistent** (флаг в reserved), но **v0.2 keep simple** —
+просто требуем что `packet_ring_slots × avg_packet_size > GOP_size_in_bytes`
+для нормальной работы. Sizing guide см. в [docs/integration.md](integration.md).
+
+### 10.15 Error codes (новые)
+
+| Code | Name | Когда |
+|---|---|---|
+| -20 | `CUFRAMES_ERR_PACKET_OVERSIZED` | publish_packet с size > max_packet_size |
+| -21 | `CUFRAMES_ERR_NO_PACKET_RING` | subscriber запросил packets, publisher без packet ring |
+| -22 | `CUFRAMES_ERR_NO_CODEC_PARAMS` | get_codec_params вызван до set_codec_extradata publisher'ом |
+| -23 | `CUFRAMES_ERR_PACKET_OVERRUN` | subscriber slow — packet seq уехал, надо resync на keyframe |
+
+### 10.16 Open для v0.3+
+
+- **Sub-stream selection** — publisher может публиковать несколько
+  packet rings (для multi-resolution streams). Сейчас один key = один stream.
+  v0.3 → `<key>-substream-<N>` naming?
+- **Codec change mid-stream** — текущий design требует publisher restart.
+  Future: invalidate codec_extradata + bump generation field.
+- **Audio streams** — аналогично в packet ring, но codec_id = audio (AAC,
+  Opus). v0.3.
@@ -0,0 +1,284 @@
+# cuframes Python bindings
+
+Status: **v0.4 — Phase 0 alpha** (issue [gx/cuframes#6](http://server:3000/gx/cuframes/issues/6))
+
+Python пакет `cuframes` — pybind11-обёртка над C ABI libcuframes. Цель —
+позволить downstream ML/CV пайплайнам (yolo-world-detector, zone-motion,
+custom скриптам) подписываться на cuframes **без CPU round-trip**: получать
+NV12 frames прямо как CUDA pointer / `torch.Tensor` (DLPack export, zero-copy
+из VRAM publisher'а в VRAM consumer'а).
+
+## Установка
+
+Standalone wheel (рекомендуемый):
+
+```bash
+cd cuframes/python/
+pip install -e . --no-build-isolation
+```
+
+Через корневой CMake:
+
+```bash
+cmake -B build -DBUILD_PYTHON_BINDINGS=ON
+cmake --build build -j
+```
+
+## Quick start
+
+```python
+import cuframes
+
+print(cuframes.version_string())  # "0.4.0"
+
+with cuframes.subscribe("cam-parking",
+                        consumer_name="yolo-world",
+                        connect_timeout_ms=5000) as sub:
+    with sub.next_frame(timeout_ms=1000) as frame:
+        print(f"{frame.width}x{frame.height} "
+              f"format={frame.format} seq={frame.seq}")
+```
+
+## API
+
+### `cuframes.subscribe(key, ...)`
+
+Создать подписку на publisher. Возвращает `CuframesSubscriber`.
+
+| Параметр | Тип | Default | Назначение |
+|---|---|---|---|
+| `key` | `str` | (required) | Имя publisher'а (`"cam-parking"` и т.п.) |
+| `consumer_name` | `str \| None` | `None` (auto-generated) | Идентификатор подписки |
+| `mode` | `SubscriberMode` | `NEWEST_ONLY` | `NEWEST_ONLY` skip'ит промежуточные frames, `STRICT_ORDER` — все по порядку |
+| `cuda_device` | `int` | `0` | CUDA device id |
+| `connect_timeout_ms` | `int` | `-1` (бесконечно) | Сколько ждать publisher'а |
+| `consumer_stream` | `int` | `0` (default stream) | `cudaStream_t` как pointer |
+
+### `CuframesSubscriber`
+
+Контекст-менеджер. Methods/properties:
+
+```python
+sub.next_frame(timeout_ms=-1)  # → CuframesFrame
+sub.close()                     # idempotent
+
+# read-only properties
+sub.key                # str
+sub.consumer_name      # str
+sub.mode               # SubscriberMode
+sub.cuda_device        # int
+sub.consumer_stream    # int (cudaStream_t ptr)
+sub.closed             # bool
+
+# health / stats (Phase 0 counters)
+sub.frames_received    # int
+sub.timeouts           # int
+sub.errors             # int
+sub.last_seq           # int (sequence number последнего frame'а)
+sub.gap_count          # int (proxy для drop count в NEWEST_ONLY)
+sub.last_frame_pts_ns  # int
+sub.stats()            # dict — snapshot всех counters для MQTT publish
+```
+
+### `CuframesFrame`
+
+Контекст-менеджер. Properties (read-only):
+
+```python
+frame.cuda_ptr   # int (uintptr_t)
+frame.format     # PixelFormat
+frame.width      # int
+frame.height     # int
+frame.pitch_y    # int — pitch Y plane (важно — может быть > width!)
+frame.pitch_uv   # int
+frame.seq        # int — sequence number у publisher'а
+frame.pts_ns     # int — CLOCK_MONOTONIC у publisher'а
+frame.released   # bool
+
+# DLPack export (zero-copy)
+frame.dlpack_y()       # capsule — Y plane как 2D uint8 GPU tensor
+frame.dlpack_uv()      # capsule — UV plane (только NV12)
+frame.__dlpack__()     # protocol для torch.from_dlpack(frame)
+frame.__dlpack_device__()  # (kDLCUDA=2, device_id)
+```
+
+## Интеграция с PyTorch
+
+```python
+import torch
+import cuframes
+
+with cuframes.subscribe("cam-parking", connect_timeout_ms=5000) as sub:
+    with sub.next_frame() as frame:
+        # Single-plane (default — Y plane для NV12)
+        y_tensor = torch.from_dlpack(frame)
+        # Multi-plane explicit
+        y = torch.from_dlpack(frame.dlpack_y())   # shape=[H, W] uint8
+        uv = torch.from_dlpack(frame.dlpack_uv()) # shape=[H/2, W] uint8
+
+        # Y plane уже в VRAM — никаких copy. Можно сразу feed в NN.
+        y_float = y.float() / 255.0  # будет на CUDA device
+```
+
+## Интеграция с CuPy
+
+```python
+import cupy
+import cuframes
+
+with cuframes.subscribe("cam-parking", connect_timeout_ms=5000) as sub:
+    with sub.next_frame() as frame:
+        y_array = cupy.from_dlpack(frame.dlpack_y())  # cupy.ndarray на GPU
+```
+
+## Pattern: reconnect-loop для долгоживущего consumer'а
+
+```python
+import time
+import cuframes
+
+def consume_camera(key: str, on_frame):
+    while True:
+        try:
+            with cuframes.subscribe(key, connect_timeout_ms=5000) as sub:
+                while True:
+                    try:
+                        with sub.next_frame(timeout_ms=1000) as frame:
+                            on_frame(frame)
+                    except cuframes.CuframesFrameTimeout:
+                        # просто нет новых кадров — продолжаем ждать
+                        continue
+        except cuframes.CuframesPublisherGone:
+            # publisher умер / перезапускается — переподписываемся
+            print(f"publisher {key} gone, reconnect через 1s")
+            time.sleep(1)
+        except cuframes.CuframesError as e:
+            # прочая ошибка cuframes — логируем и пробуем переподписаться через 5s
+            print(f"error: {e!r}")
+            time.sleep(5)
+```
+
+## Per-subscriber CUDA stream
+
+В продакшене на 4+ камеры каждый subscriber должен иметь свой stream —
+иначе `cudaStreamWaitEvent` сериализует всех consumer'ов через default
+stream.
+
+С `cuda-python`:
+
+```python
+from cuda import cudart
+import cuframes
+
+err, stream = cudart.cudaStreamCreate()
+assert err == cudart.cudaError_t.cudaSuccess
+
+with cuframes.subscribe("cam-parking", consumer_stream=int(stream)) as sub:
+    ...
+```
+
+С `torch.cuda.Stream`:
+
+```python
+import torch
+import cuframes
+
+stream = torch.cuda.Stream()
+with cuframes.subscribe("cam-parking",
+                        consumer_stream=stream.cuda_stream) as sub:
+    with torch.cuda.stream(stream):
+        with sub.next_frame() as frame:
+            tensor = torch.from_dlpack(frame)
+            # ... inference на этом stream'е ...
+```
+
+## Pitch alignment — важно!
+
+NVDEC отдаёт NV12 с pitch alignment 256 байт. Для камер с шириной, не
+кратной 256, pitch может оказаться больше width (`gate_lpr 2688×1520` →
+pitch 2688 OK; но представьте `640×480` → pitch обычно 640 байт, но
+**может быть 768**).
+
+```python
+# WRONG — assume pitch == width
+y = torch.frombuffer(...)  # данные смещены
+
+# RIGHT — использовать DLPack который сам respect'ит strides
+y = torch.from_dlpack(frame.dlpack_y())  # stride учтён правильно
+
+# ALTERNATIVELY — manual через cuda-python с правильным pitch
+ptr = frame.cuda_ptr
+pitch = frame.pitch_y
+height = frame.height
+```
+
+## Thread-safety contract
+
+- Каждый `CuframesSubscriber` принадлежит **одному Python потоку**.
+  Создание и все вызовы (`next_frame`, `close`) — в одном thread.
+- Несколько subscriber'ов в разных потоках — **OK** (каждому свой handle,
+  свой CUDA stream).
+- `CuframesFrame` тоже принадлежит одному потоку — после `release()` его
+  CUDA pointer становится недействительным, доступ из другого потока —
+  undefined behavior.
+- Внутренний GIL отпускается на блокирующих вызовах (`subscriber_create`,
+  `next_frame`) — другие Python потоки могут выполняться.
+
+Для multi-camera в одном процессе используйте `asyncio` или `threading`:
+
+```python
+import threading
+import cuframes
+
+def worker(camera_key):
+    with cuframes.subscribe(camera_key, connect_timeout_ms=5000) as sub:
+        # subscribe в этом же потоке
+        while True:
+            with sub.next_frame(timeout_ms=1000) as frame:
+                process(frame)
+
+for key in ["cam-parking", "cam-front_yard", "cam-gate_lpr", "cam-back_yard"]:
+    threading.Thread(target=worker, args=(key,), daemon=True).start()
+```
+
+## Error taxonomy
+
+Все exception'ы наследуются от `CuframesError`. Конкретные subclass'ы
+позволяют разную обработку:
+
+| Exception | Когда выбрасывается | Что делать |
+|---|---|---|
+| `CuframesPublisherGone` | publisher умер или ещё не стартовал | reconnect-loop |
+| `CuframesFrameTimeout` | timeout без frame'а | продолжать ждать или log'нуть |
+| `CuframesDeviceLost` | CUDA error на cross-process sync | abort, не recoverable |
+| `CuframesShmError` | socket/mmap/IPC error | log, abort или восстановить |
+| `CuframesProtocolMismatch` | версия libcuframes несовместима | пересобрать |
+| `CuframesInvalidArgument` | bug в caller | fix code |
+| `CuframesOutOfMemory` | cudaMalloc fail | reduce работу |
+| `CuframesInternal` | bug в libcuframes | report |
+
+## Backpressure
+
+`next_frame()` blocking call с GIL released. Если consumer медленнее
+publisher'а:
+- В `NEWEST_ONLY` mode (default) — publisher продолжает писать, consumer
+  получает **самый свежий** frame (промежуточные пропускает). `gap_count`
+  растёт.
+- В `STRICT_ORDER` mode — при ring overflow `CuframesPublisherGone` →
+  reconnect.
+
+Frame удерживать долго **нельзя**: в `STRICT_WAIT` policy publisher
+заблокирует ring. Pattern — забрать DLPack, инициировать GPU работу,
+release frame сразу.
+
+## Текущие ограничения (Phase 0)
+
+- Publisher API не обёрнут (только subscriber-side)
+- Packet ring (encoded video) не обёрнут
+- Async callback API не обёрнут
+- `ring_occupancy` / реальный drop count — нет в C API (counted в pybind как
+  `gap_count`, это proxy)
+- Smoke test реального subscribe требует Docker IPC namespace (cuframes
+  socket/SHM живут в namespace publisher'а)
+
+Эти ограничения снимаются по мере необходимости — issues в
+[gx/cuframes](http://server:3000/gx/cuframes).
@@ -181,4 +181,36 @@ Phase 0 PoC (2026-05-14):
 - **Docker:** 29.1.3 с nvidia-container-runtime
 - **Container:** Ubuntu 24.04 + GCC 13 + Clang + CMake 3.28 + Ninja

-Дополнительный target matrix будет в CI после Phase 4.
+## Production deployment matrix (v0.1.0)
+
+Что подтверждено в 24h+ production run:
+
+| Слой | Версия | Comments |
+|---|---|---|
+| NVIDIA driver | 555+ | минимум для CUDA 12 user runtime |
+| CUDA toolkit (build) | 12.4 (Debian 12 / Ubuntu 22.04) либо 13.0 (Ubuntu 24.04) | toolkit для builder image, не runtime |
+| GPU | RTX 5090 (sm_120) | проверено; раньше — sm_75 минимум |
+| Builder OS | Ubuntu 22.04 (glibc 2.35) | forward-compat с Debian 12 runtime |
+| Runtime OS (Frigate) | Debian 12 (glibc 2.36) | base image Frigate `stable-tensorrt` |
+| Runtime OS (cctv-backend) | Ubuntu 22.04 либо Debian 12 | matched с builder |
+| Docker | 29.1.x | для buildx |
+| docker buildx | v0.34.0+ | `apt install docker-buildx-plugin` либо manual install из GH releases |
+| nvidia-container-toolkit | 1.14+ | для `runtime: nvidia` |
+
+## Docker namespace requirements (cross-container CUDA IPC)
+
+Для consumer'а который подключается к publisher'у в **другом** container'е:
+
+| Что нужно | Как настроить |
+|---|---|
+| `/dev/shm` shared (header + ring metadata) | `ipc: container:<publisher>` либо `ipc: shareable` у publisher + same у consumer |
+| `/proc` visibility (CUDA IPC peer validation) | `pid: container:<publisher>` |
+| `/run/cuframes/*.sock` доступен | volume mount `cuframes_sock:/run/cuframes:ro` |
+| GPU access | `runtime: nvidia` + `NVIDIA_VISIBLE_DEVICES=all` |
+| Socket file permissions | `user: root` либо chmod в publisher |
+
+**Все 5** должны быть выполнены. Подробности — [docs/troubleshooting.md](troubleshooting.md).
+
+**Special case: s6-overlay containers (Frigate, linuxserver.io stack)**: `pid:` share **невозможен** — s6-overlay требует PID 1. Workaround: только `ipc:` + race window connect. См. troubleshooting.
+
+Дополнительный target matrix будет в CI после Phase 4 (см. [ROADMAP.md](../ROADMAP.md)).
@@ -0,0 +1,326 @@
+# Troubleshooting
+
+Реальные грабли которые мы прошли при первой production deployment'е cuframes
+(Frigate + custom C++ processor + custom Python). Документировано чтобы вы их
+не повторяли.
+
+## Содержание
+
+- [Runtime / CUDA IPC](#runtime--cuda-ipc)
+  - [`cudaIpcOpenEventHandle: invalid device context`](#cudaipcopeneventhandle-invalid-device-context)
+  - [Subscriber timeout (`cuframes_subscriber_create: timeout`)](#subscriber-timeout)
+  - [Permission denied на socket](#permission-denied-на-socket)
+- [Frigate-specific](#frigate-specific)
+  - [`s6-overlay-suexec: fatal: can only run as pid 1`](#s6-overlay-suexec-fatal-can-only-run-as-pid-1)
+  - [`No such filter: 'scale_cuda'`](#no-such-filter-scale_cuda)
+  - [Missing dynamic .so после ffmpeg replace](#missing-dynamic-so-после-ffmpeg-replace)
+- [Build / FFmpeg patch](#build--ffmpeg-patch)
+  - [`libcuframes not found` при configure](#libcuframes-not-found-при-configure)
+  - [`ffbuild/library.mak: No such file`](#ffbuildlibrarymak-no-such-file)
+  - [`could not find a working compiler` (GMP)](#could-not-find-a-working-compiler-gmp)
+  - [`zlib: download failed` в crosstool-NG](#zlib-download-failed-в-crosstool-ng)
+  - [`stdbit.h: No such file` при `--enable-cuda-llvm`](#stdbith-no-such-file-при---enable-cuda-llvm)
+- [Docker / IPC](#docker--ipc)
+  - [Cross-container CUDA IPC: ipc + pid namespace share](#cross-container-cuda-ipc-ipc--pid-namespace-share)
+  - [Buildx container driver не видит host images](#buildx-container-driver-не-видит-host-images)
+- [Networking / RTSP](#networking--rtsp)
+  - [RTSP/RTP UDP не доходит до клиента (docker NAT)](#rtsprtp-udp-не-доходит-до-клиента-docker-nat)
+  - [`Nonmatching transport in server reply`](#nonmatching-transport-in-server-reply)
+- [Gitea Actions / CI](#gitea-actions--ci)
+  - [`node: executable file not found`](#node-executable-file-not-found)
+  - [`SyntaxError: Unexpected token '{'` (Node 12)](#syntaxerror-unexpected-token--node-12)
+
+---
+
+## Runtime / CUDA IPC
+
+### `cudaIpcOpenEventHandle: invalid device context`
+
+**Симптом**: subscriber сразу после `cuframes_subscriber_create` падает с этой ошибкой.
+
+**Причина**: CUDA driver проверяет IPC peer через `/proc/<pid>/...`. Если процесс publisher'а **не виден** в PID namespace consumer'а — context считается невалидным.
+
+**Fix**: shared PID namespace.
+
+Docker:
+```yaml
+consumer:
+  ipc: "container:<publisher>"   # shared /dev/shm
+  pid: "container:<publisher>"   # ← вот это критично, без него fail
+```
+
+Host process: запуск consumer'а на host'е (либо publisher'а на host'е тоже) — same default namespace.
+
+**Caveat**: если consumer image использует s6-overlay (Frigate, linuxserver.io
+images) — `pid: container:` несовместим (см. [соответствующую секцию](#s6-overlay-suexec-fatal-can-only-run-as-pid-1)).
+
+### Subscriber timeout
+
+**Симптом**: `cuframes_subscriber_create: timeout` без других ошибок.
+
+**Причины** (в порядке вероятности):
+1. `/run/cuframes/<key>.sock` не виден consumer'у — забыли volume-mount
+2. `/run/cuframes` смонтирован, но publisher ещё не успел создать socket — увеличить `connect_timeout_ms`
+3. Publisher запущен, socket есть, но **permission denied** — см. ниже
+
+### Permission denied на socket
+
+**Симптом**: socket виден через `ls -la /run/cuframes/`, owner `root`. Consumer process — non-root user → не может `connect()`.
+
+**Fix**:
+- Запустить consumer как root: `user: root` в compose
+- Либо изменить permissions socket после создания (publisher delegation) — TBD в v0.2
+
+---
+
+## Frigate-specific
+
+### `s6-overlay-suexec: fatal: can only run as pid 1`
+
+**Симптом**: container Frigate'а в restart loop, в logs только эта ошибка.
+
+**Причина**: `pid: container:<publisher>` сделал Frigate not-PID-1 в shared namespace. s6-overlay v3 strictly требует PID 1 для proper signal handling/zombie reaping.
+
+**Fix**: убрать `pid: container:` для Frigate. Только `ipc: container:` shared.
+
+**Trade-off**: без shared pid некоторые edge cases CUDA IPC ломаются (см. [соответствующую секцию](#cudaipcopeneventhandle-invalid-device-context)). Frigate **на практике** работает, потому что подключается до того как CUDA driver проверяет peer (race window), но если publisher restart'нётся посередине — Frigate'у не удастся переподключиться без перезапуска.
+
+**Real fix** (planned v0.2): encoded packet sharing — Frigate detect получает кадры через decoded path (work-around), record получает encoded через socket-based protocol который **не** требует cudaIpcOpenEventHandle.
+
+### `No such filter: 'scale_cuda'`
+
+**Симптом**: Frigate ffmpeg subprocess падает с этой ошибкой в `AVFilterGraph`.
+
+**Причина**: наш patched FFmpeg собран без `--enable-cuda-llvm` (см. [stdbit.h грабля](#stdbith-no-such-file-при---enable-cuda-llvm)). Без cuda-llvm в FFmpeg нет CUDA filters (scale_cuda, overlay_cuda).
+
+**Fix**: в Frigate config.yml явно отключи hwaccel cuda:
+```yaml
+ffmpeg:
+  hwaccel_args: []   # CPU scale вместо scale_cuda
+```
+
+Cost: 5-10% CPU per FHD25 камера. **Real fix** (v0.2): publisher-side resize в cuframes сам.
+
+### Missing dynamic .so после ffmpeg replace
+
+**Симптом**: после `docker cp` patched ffmpeg в Frigate container — `ldd ffmpeg`
+показывает `libharfbuzz.so.0 => not found`, `libfribidi.so.0 => not found`, …
+~20 missing .so.
+
+**Причина**: Frigate's bundled ffmpeg **статически слинкован** (NickM-27/FFmpeg-Builds
+делает full static build). Все 30+ deps встроены в один binary. Frigate runtime
+image **не имеет** этих .so packages installed (ему не надо — bundled ffmpeg
+self-contained).
+
+Наш custom ffmpeg — **dynamic linked** (apt deps). Нужны .so на target.
+
+**Fix**: либо
+- `apt install` missing libs в Frigate (additive image modification):
+  ```bash
+  apt install libharfbuzz0b libfribidi0 librist4 libsrt1.5-openssl libssh-4 \
+              libvpx7 libwebpmux3 libwebp7 libdav1d6 libaom3 libmp3lame0 \
+              libsvtav1enc1 libtheora0 libvorbis0a libvorbisenc2 \
+              libx264-164 libx265-199 libopus0
+  ```
+- Либо строить наш ffmpeg static (sources from NickM-27 pipeline) — complex
+  (см. [zlib download / GMP compiler граблю](#zlib-download-failed-в-crosstool-ng))
+
+Best practice: создать `Dockerfile.frigate` overlay поверх Frigate image,
+который добавляет deps и копирует ffmpeg. Запечь в image, не in-place patch.
+
+---
+
+## Build / FFmpeg patch
+
+### `libcuframes not found` при configure
+
+**Симптом**: FFmpeg configure (с `--enable-libcuframes`) fails с этой ошибкой
+из `enabled libcuframes && require libcuframes ...`. config.log показывает
+`fatal error: cuframes/cuframes.h: No such file or directory`.
+
+**Причины**:
+
+1. **CMake install rules отсутствовали** в libcuframes (early commits до 601806a).
+   `cmake --install` создавал пустой prefix. Fix: обновить cuframes до ≥ 601806a.
+
+2. **Wrong HINTS в find_library**: твой проект ищет в `${CUFRAMES_ROOT}/build/...`
+   но install layout кладёт в `${CUFRAMES_ROOT}/lib`. Добавь оба пути в HINTS.
+
+3. **`rm -f libcuframes.so*`** удалил .so но **.a** file называется
+   `libcuframes_static.a` (не `libcuframes.a`) → linker не находит `-lcuframes`.
+   Fix: либо не удаляй .so, либо переименуй .a при install.
+
+### `ffbuild/library.mak: No such file`
+
+**Симптом**: configure FFmpeg success, но `make` падает сразу:
+`Makefile:123: ffbuild/library.mak: No such file or directory`.
+
+**Причина**: вы сделали ваш fork FFmpeg через snapshot (не git clone), и **случайно
+исключили `ffbuild/`** в rsync. Это **source files** FFmpeg, не build artifacts.
+
+**Fix**: убедись что `ffbuild/` есть в твоём FFmpeg checkout (`ls ffbuild/library.mak`).
+Если делаешь snapshot через rsync — не используй `--exclude=ffbuild`.
+
+### `could not find a working compiler` (GMP)
+
+**Симптом**: crosstool-NG build падает на `Installing GMP for host` с
+`configure: error: could not find a working compiler`. config.log показывает
+`no, long long reliability test 1`.
+
+**Причина**: GMP 6.2.1 имеет known issue с GCC 11+ (Ubuntu 22.04 default).
+Проверка long-long reliability fail'ит false-positive.
+
+**Fix**: pin GMP к 6.3.0 в `ct-ng-config`:
+```
+CT_GMP_V_6_3=y
+# CT_GMP_V_6_2 is not set
+CT_GMP_VERSION="6.3.0"
+```
+
+И убедись что crosstool-NG version (commit) поддерживает 6.3.0 (≥ master 2024-09).
+
+### `zlib: download failed` в crosstool-NG
+
+**Симптом**: crosstool-NG step `Retrieving 'zlib-1.2.12'` fail'ит.
+
+**Причина**: zlib.net убрали старые versions с дефолтного location — теперь они
+только в `/fossils/` subdirectory. Crosstool-NG hardcoded URL не работает.
+
+**Fix**: pre-fetch tarball + положить в local cache:
+```bash
+wget https://zlib.net/fossils/zlib-1.2.12.tar.gz -O preload/zlib-1.2.12.tar.gz
+```
+
+В Dockerfile перед `ct-ng build`:
+```dockerfile
+COPY preload/*.tar.gz /root/src/
+```
+
+`CT_LOCAL_TARBALLS_DIR=${HOME}/src` — crosstool-NG найдёт в cache и не пойдёт
+download.
+
+### `stdbit.h: No such file` при `--enable-cuda-llvm`
+
+**Симптом**: FFmpeg configure с `--enable-cuda-llvm` fail'ит:
+`fatal error: stdbit.h: No such file or directory`. ERROR: cuda_llvm requested
+but not found.
+
+**Причина**: `stdbit.h` — C23 standard header. Доступен в glibc ≥ 2.39.
+
+- Ubuntu 22.04 = glibc 2.35 — **нет**
+- Debian 12 = glibc 2.36 — **нет**
+- Ubuntu 24.04 = glibc 2.39 — есть
+- Debian 13 (trixie) = glibc 2.41 — есть
+
+**Fix options**:
+1. Build на newer base (Ubuntu 24.04+). Но runtime target (Frigate Debian 12)
+   не запустит binary с glibc-2.38 symbols (backwards-incompatible).
+2. Убрать `--enable-cuda-llvm`. Потеря: CUDA filters (`scale_cuda`, `overlay_cuda`,
+   `hwupload_cuda`). Decode/encode через NVDEC/NVENC всё равно работают.
+3. Дождаться когда Frigate base обновится до newer Debian — вне твоего контроля.
+
+**На практике**: убираем cuda-llvm, в Frigate config `hwaccel_args: []`.
+См. [scale_cuda секцию](#no-such-filter-scale_cuda).
+
+---
+
+## Docker / IPC
+
+### Cross-container CUDA IPC: ipc + pid namespace share
+
+| Что нужно | Compose option |
+|---|---|
+| /dev/shm shared (для cuframes header + SHM ring) | `ipc: container:<publisher>` (либо `ipc: shareable` у publisher + same у consumer) |
+| /proc visibility (для CUDA IPC peer validation) | `pid: container:<publisher>` |
+| `/run/cuframes/*.sock` доступен | volume mount: `cuframes_sock:/run/cuframes:ro` |
+| GPU access | `runtime: nvidia` |
+| Socket permissions | `user: root` (либо chmod socket в publisher) |
+
+**Все 5** должны быть выполнены. Один пропуск — fail при subscriber_create или
+cudaIpcOpenEventHandle.
+
+### Buildx container driver не видит host images
+
+**Симптом**: при использовании custom buildx builder (`docker buildx create
+--driver docker-container ...`) с `FROM local-image:tag` — error `failed to
+authorize: 403 Forbidden` (buildkit пытается pull с registry).
+
+**Причина**: container driver buildx изолирован, не имеет доступа к host's
+local docker daemon images. Pull через registry.
+
+**Fix**: либо
+- Не использовать custom builder — `docker buildx use default` (использует host
+  daemon). Минус: теряем `--cache-to/--cache-from type=local`.
+- Либо push local image в **registry** (local или gitea), и buildx pull'ит оттуда.
+
+---
+
+## Networking / RTSP
+
+### RTSP/RTP UDP не доходит до клиента (docker NAT)
+
+**Симптом**: RTSP server в docker контейнере с `ports: "554:8555"`. Клиент (TV, VLC)
+делает RTSP SETUP successfully (TCP control работает), но video frames не приходят.
+
+**Причина**: RTP идёт **UDP**, sourced из docker network namespace. SNAT MASQUERADE
+для outbound работает, но RTP destination port (которое клиент опубликовал в SETUP)
+**не маппится обратно** через docker bridge — клиент видит UDP packets от чужого
+source IP (docker network 172.x), не от 192.168.88.23 как expected.
+
+**Fix**: `network_mode: host` для RTSP-server контейнера. Тогда server listens
+**напрямую** на host interfaces, RTP packets идут без NAT.
+
+Trade-offs:
+- Все ports app'а listen на host network (нет port mapping). Проверь port collisions.
+- DB env vars (postgres:5432 в docker network DNS) надо менять на host paths
+  (`localhost:5433` если postgres exposed на host port 5433).
+
+### `Nonmatching transport in server reply`
+
+**Симптом**: `ffprobe -rtsp_transport tcp -i rtsp://...` fail'ит с этим сообщением.
+
+**Причина**: RTSP server возвращает SDP с UDP-only transport. Client ожидает TCP
+interleaved.
+
+**Fix**: использовать UDP transport: `-rtsp_transport udp` (либо default behavior).
+Если TV не поддерживает UDP — нужен RTSP server который умеет RTP-over-TCP
+interleaved (cctv-processor v0.1 не умеет).
+
+---
+
+## Gitea Actions / CI
+
+### `node: executable file not found`
+
+**Симптом**: первый JS action (например `actions/checkout@v4`) fail'ит:
+`OCI runtime exec failed: exec: "node": executable file not found in $PATH`.
+
+**Причина**: Gitea act_runner запускает JS actions через `node`, но твой
+custom container (например `nvidia/cuda:...`) не имеет node installed.
+
+**Fix**: pre-install node в первом `run:` step (до actions/checkout):
+```yaml
+steps:
+  - name: Bootstrap node
+    run: apt-get update && apt-get install -y nodejs git ca-certificates
+  - name: Checkout
+    uses: actions/checkout@v4
+```
+
+Либо использовать container с node pre-installed (`docker.gitea.com/runner-images:ubuntu-22.04`).
+
+### `SyntaxError: Unexpected token '{'` (Node 12)
+
+**Симптом**: после `apt install nodejs` в Ubuntu 22.04 — actions/checkout@v4 fail'ит:
+`SyntaxError: Unexpected token '{' at static {...}`.
+
+**Причина**: Ubuntu 22.04 apt'овский `nodejs` = Node **12**. `actions/checkout@v4`
+скомпилирован для Node 20+ (static class blocks — ES2022).
+
+**Fix**: install Node 20 from NodeSource:
+```bash
+curl -fsSL https://deb.nodesource.com/setup_20.x | bash -
+apt-get install -y nodejs
+```
+
+В Ubuntu 24.04 apt даёт Node 18, который уже понимает static class blocks — эта ошибка там не возникает.
@@ -0,0 +1,8 @@
+# Скопировать в .env (не commit'ить!)
+# .env должен быть в .gitignore
+
+# Камеры: пароли admin user'а на Dahua/Hikvision/etc
+CAM_PARKING_PASS=changeme
+
+# Пароль камеры для RTSP URL в Frigate config.yml (подставляется как ${FRIGATE_RTSP_PASSWORD})
+FRIGATE_RTSP_PASSWORD=changeme
@@ -0,0 +1,61 @@
+# examples/frigate-compose
+
+Reference docker-compose для Frigate + cuframes integration. **НЕ** копировать
+в production бездумно — это шаблон, адаптируй под свою инфру (IP-адреса камер,
+пароли, mount paths, network).
+
+## Quickstart
+
+1. Build patched Frigate image (one-time setup, ~15 мин):
+   ```bash
+   # См. docs/integrations/frigate.md, Шаг 1 — там полный Dockerfile.
+   docker build -t local/frigate-cuframes:latest -f Dockerfile.frigate .
+   ```
+
+2. Pull cuframes publisher image:
+   ```bash
+   docker pull git.goldix.org/gx/cuframes:0.1
+   # либо собрать local: docker build -t local/cuframes:0.1 -f docker/Dockerfile.runtime ../..
+   ```
+
+3. Скопировать .env:
+   ```bash
+   cp .env.example .env
+   $EDITOR .env   # подставь свои camera passwords
+   ```
+
+4. Адаптировать `docker-compose.yml`:
+   - `parking-cam-ip` → реальный IP камеры
+   - `--key cam-parking` → имя по вкусу (должно matche'ить config.yml `cuframes://<key>`)
+   - `cam-parking` в Frigate config → так же matched
+
+5. Адаптировать `config/config.yml`:
+   - детектор (cpu / onnx / tensorrt)
+   - пути к media
+   - дополнительные камеры если нужно
+
+6. Run:
+   ```bash
+   docker compose up -d
+   docker logs -f frigate
+   # UI: http://localhost:5000 (internal) либо https://localhost:8971 (auth)
+   ```
+
+## Что демонстрирует
+
+- Один publisher (`cuframes-pub-parking`) делает 1× NVDEC на parking-камеру
+- Frigate подключается к publisher через `ipc:container:` + `cuframes://` URL
+- Frigate **не** делает свой NVDEC для detect-path — берёт готовые NV12 frames
+
+## Что НЕ демонстрирует
+
+- Record path — Frigate всё ещё открывает второй RTSP к камере (для архива
+  `-c:v copy` mux). v0.2 cuframes решит через encoded packet sharing
+  (см. [issue #2](https://git.goldix.org/gx/cuframes/issues/2))
+- Multi-camera setup — добавь больше publisher'ов и camera-blocks в config.yml
+- HA/MQTT интеграция — добавь свой mqtt block
+
+## См. также
+
+- [docs/integrations/frigate.md](../../docs/integrations/frigate.md) — полный walkthrough
+- [docs/integration.md](../../docs/integration.md) — общая интеграция
@@ -0,0 +1,49 @@
+# Minimal Frigate config с cuframes integration.
+# Полный guide: docs/integrations/frigate.md
+
+mqtt:
+  enabled: false
+
+detectors:
+  # Замени на свой detector (tensorrt / onnx / cpu). Здесь — placeholder.
+  cpu:
+    type: cpu
+
+# CRITICAL: hwaccel cuda отключён — наш patched ffmpeg без --enable-cuda-llvm
+# (не работает на glibc < 2.39, а у Debian 12, на котором Frigate runtime, — 2.36).
+# Без cuda-llvm нет scale_cuda filter. Detect-path использует CPU scale, но
+# decode уже сделан у publisher'а — net выигрыш всё равно.
+ffmpeg:
+  hwaccel_args: []
+  output_args:
+    record: preset-record-generic-audio-aac
+
+cameras:
+  parking_overview:
+    enabled: true
+    friendly_name: Парковка
+    ffmpeg:
+      inputs:
+        # main (full-res) — только запись в архив через прямой RTSP (`-c:v copy`, no decode у Frigate)
+        # После cuframes v0.2 этот path тоже может через cuframes_packets:// (encoded share)
+        - path: rtsp://admin:${FRIGATE_RTSP_PASSWORD}@parking-cam-ip:554/cam/realmonitor?channel=1&subtype=0
+          roles: [record]
+
+        # sub-stream → через cuframes (decoded у publisher'а, без второго NVDEC)
+        - path: cuframes://cam-parking
+          input_args: -f cuframes
+          roles: [detect]
+    detect:
+      width: 640
+      height: 480
+      fps: 5
+
+record:
+  enabled: true
+  retain:
+    days: 7
+
+snapshots:
+  enabled: true
+  retain:
+    default: 7
@@ -0,0 +1,73 @@
+# Reference docker-compose для Frigate + cuframes integration.
+# Полный guide: docs/integrations/frigate.md
+#
+# Что нужно подготовить заранее:
+#   1. Build local image local/frigate-cuframes:latest по Dockerfile.frigate
+#      (см. docs/integrations/frigate.md, Шаг 1)
+#   2. Pull cuframes runtime image:
+#      docker pull git.goldix.org/gx/cuframes:0.1   # либо собрать local
+#   3. Скопировать config/config.yml (placeholder в config/ рядом)
+#   4. .env с CAM_PARKING_PASS=... и FRIGATE_RTSP_PASSWORD=...
+#
+# Запуск:
+#   docker compose up -d
+#   # UI: http://host:5000 (internal, без auth) либо https://host:8971 (with auth)
+
+services:
+  # 1× publisher на камеру — single source of RTSP + NVDEC
+  cuframes-pub-parking:
+    image: git.goldix.org/gx/cuframes:0.1
+    container_name: cuframes-pub-parking
+    restart: unless-stopped
+    runtime: nvidia
+    ipc: shareable
+    shm_size: 256m
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    volumes:
+      - cuframes_sock:/run/cuframes
+    command:
+      - /usr/local/bin/cuframes-rtsp-source
+      - --rtsp
+      # Используем sub-stream для detect-path (lighter resolution, тот же camera load)
+      - "rtsp://admin:${CAM_PARKING_PASS}@parking-cam-ip:554/cam/realmonitor?channel=1&subtype=1"
+      - --key
+      - cam-parking
+      - --ring
+      - "6"
+      - --verbose
+
+  frigate:
+    image: local/frigate-cuframes:latest   # см. docs/integrations/frigate.md Шаг 1
+    container_name: frigate
+    restart: unless-stopped
+    depends_on:
+      cuframes-pub-parking:
+        condition: service_started
+    runtime: nvidia
+    privileged: true
+    shm_size: 512m
+    # WARN: только ipc share — pid НЕ shared (Frigate's s6-overlay требует PID 1).
+    # Frigate подсоединяется к first CUDA context publisher'а в shared /dev/shm.
+    ipc: "container:cuframes-pub-parking"
+    environment:
+      FRIGATE_RTSP_PASSWORD: "${FRIGATE_RTSP_PASSWORD}"
+      NVIDIA_VISIBLE_DEVICES: all
+      NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
+    ports:
+      - "5000:5000"        # UI без auth (internal, не expose external!)
+      - "8971:8971"        # UI с HTTPS + auth
+      - "8554:8554"        # RTSP restream (go2rtc)
+      - "8555:8555/tcp"
+      - "8555:8555/udp"
+    volumes:
+      - cuframes_sock:/run/cuframes:ro
+      - ./config/config.yml:/config/config.yml:ro
+      - ./media:/media/frigate
+      - type: tmpfs
+        target: /tmp/cache
+        tmpfs: { size: 1000000000 }
+
+volumes:
+  cuframes_sock:
@@ -0,0 +1,78 @@
+# examples/python-consumer
+
+Reference Python consumer для cuframes через `ctypes` wrapper.
+
+## Use case
+
+AI/ML pipeline (PyTorch / ONNX / TensorRT) которому нужны декодированные кадры
+с камер. Без cuframes — каждый Python скрипт открывает RTSP + decode сам.
+С cuframes — подписывается на готовые NV12 frames от publisher'а.
+
+## Запуск
+
+```bash
+# Publisher должен быть запущен (см. tools/cuframes-rtsp-source или Docker image)
+cuframes-rtsp-source --rtsp rtsp://admin:pw@cam-ip:554/... --key cam-parking &
+
+# Consumer (same host, либо same docker namespace — см. требования ниже)
+python3 cuframes_consumer.py --key cam-parking --max-frames 100
+```
+
+Ожидаемый output:
+```
+[consumer] connected to 'cam-parking'
+[consumer] first frame: 640x480 NV12, pitch_y=640, pitch_uv=640, cuda_ptr=0x...
+[consumer] received=25 seq=42 pts_ms=...
+...
+=== RESULT ===
+received: 100 / 100
+elapsed: 3.96s
+avg_fps: 25.03
+```
+
+## Что этот пример НЕ делает
+
+- **НЕ копирует** GPU NV12 frame на host — `cuda_ptr` это raw CUDA device pointer.
+  Для реальной работы нужно:
+  - `pycuda` / `cupy` / `cuda-python` библиотека для CUDA memcpy
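+  - либо передать кадр в GPU-aware ML framework без копии: raw `cuda_ptr` нужно
+    обернуть в `__cuda_array_interface__` / DLPack capsule и импортировать через
+    `torch.from_dlpack` (PyTorch) либо `cupy.from_dlpack` (CuPy)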
+
+- **НЕ конвертирует** NV12 → RGB. Используй `cv2.cvtColor(nv12, cv2.COLOR_YUV2RGB_NV12)`
+  на host или GPU-side conversion.
+
+- **НЕ обрабатывает** inference — это skeleton, в твоём pipeline replace
+  comment-block `### ВАШ ML PIPELINE ЗДЕСЬ ###` с актуальным кодом.
+
+## Требования
+
+| | Значение |
+|---|---|
+| Python | 3.8+ |
+| `libcuframes.so.0` | в `LD_LIBRARY_PATH` (либо `/usr/local/lib`) |
+| Publisher running | да, с matching `--key` |
+| Same IPC namespace | да (host либо `ipc:container:<publisher>` в docker) |
+| Same PID namespace | да (host либо `pid:container:<publisher>` в docker) |
+| NVIDIA GPU + driver | для access `cuda_ptr` (read-only frame от publisher'а) |
+
+## Docker-style
+
+```yaml
+# В compose рядом с publisher service
+ai-pipeline:
+  image: your-ai-image:cuda
+  runtime: nvidia
+  ipc: "container:cuframes-pub-parking"
+  pid: "container:cuframes-pub-parking"
+  volumes:
+    - cuframes_sock:/run/cuframes:ro
+  environment:
+    LD_LIBRARY_PATH: /usr/local/lib
+  command: python3 /app/cuframes_consumer.py --key cam-parking --max-frames 1000000
+```
+
+## v0.3 → first-class pybind11 bindings
+
+Текущий ctypes pattern будет заменён на native pybind11 bindings в v0.3 cuframes
+([ROADMAP.md](../../ROADMAP.md)). Тогда API будет более pythonic + zero-copy через
+`__cuda_array_interface__` / `dlpack`.
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""
+Reference Python consumer для cuframes (через ctypes wrapper).
+
+До v0.3 (когда появятся первоклассные pybind11 bindings) — это minimal
+working pattern для AI/ML скриптов которые хотят подписаться на cuframes IPC.
+
+Pattern:
+  1. subscribe to cuframes (open libcuframes.so via ctypes)
+  2. в цикле: получить next() frame
+  3. cudaMemcpy → host (через pycuda либо отдельной CUDA-Python библиотекой)
+  4. передать в свой ML pipeline (ONNX/TensorRT/PyTorch)
+  5. release frame обратно publisher'у
+
+Limitations:
+  - Этот skeleton НЕ делает actual CUDA copy (нужна pycuda / cupy / cuda-python)
+  - Только sync API
+  - Только NV12 (v0.1)
+
+Запуск:
+  python3 cuframes_consumer.py --key cam-parking --max-frames 100
+
+Требования (на target host):
+  - libcuframes.so в LD_LIBRARY_PATH (либо apt install / docker)
+  - publisher запущен (cuframes-rtsp-source --key cam-parking ...)
+  - same IPC + PID namespace что publisher (если в docker — ipc:container: + pid:container:)
+"""
+
+import argparse
+import ctypes
+import sys
+import time
+from ctypes import c_int, c_int32, c_int64, c_uint64, c_uint32, c_char_p, c_void_p, c_size_t, POINTER, Structure
+
+
+# ─── C API bindings ─────────────────────────────────────────────────────
+
+# Error codes
+CUFRAMES_OK = 0
+CUFRAMES_ERR_TIMEOUT = -7
+CUFRAMES_ERR_WOULD_BLOCK = -11
+CUFRAMES_ERR_DISCONNECTED = -9
+
+# Modes
+CUFRAMES_MODE_NEWEST_ONLY = 0
+CUFRAMES_MODE_STRICT_ORDER = 1
+
+# Pixel format
+CUFRAMES_FORMAT_NV12 = 0
+
+
+class SubscriberConfig(Structure):
+    """ctypes mirror of the C struct cuframes_subscriber_config."""
+    # ctypes derives the in-memory layout from this list, so the order and
+    # types of the entries have to stay in step with the C declaration.
+    # NOTE(review): the C struct itself is not visible here — confirm the
+    # layout against cuframes.h whenever the library version changes.
+    _fields_ = [
+        ("key", c_char_p),
+        ("consumer_name", c_char_p),
+        ("mode", c_int),
+        ("cuda_device", c_int32),
+        ("connect_timeout_ms", c_int32),
+        ("_reserved", c_uint64 * 4),
+    ]
+
+
+def _load_libcuframes():
+    """Загрузить libcuframes.so + bind ctypes signatures."""
+    try:
+        lib = ctypes.CDLL("libcuframes.so.0")
+    except OSError as e:
+        sys.stderr.write(f"Cannot load libcuframes.so.0: {e}\n")
+        sys.stderr.write("Установи libcuframes (см. cuframes README) и убедись что .so в LD_LIBRARY_PATH.\n")
+        sys.exit(1)
+
+    # cuframes_strerror
+    lib.cuframes_strerror.argtypes = [c_int]
+    lib.cuframes_strerror.restype = c_char_p
+
+    # cuframes_subscriber_create
+    lib.cuframes_subscriber_create.argtypes = [POINTER(SubscriberConfig), POINTER(c_void_p)]
+    lib.cuframes_subscriber_create.restype = c_int
+
+    # cuframes_subscriber_next  (consumer_stream=NULL — sync API, default stream)
+    lib.cuframes_subscriber_next.argtypes = [c_void_p, c_void_p, POINTER(c_void_p), c_int32]
+    lib.cuframes_subscriber_next.restype = c_int
+
+    # cuframes_subscriber_release
+    lib.cuframes_subscriber_release.argtypes = [c_void_p, c_void_p]
+    lib.cuframes_subscriber_release.restype = c_int
+
+    # cuframes_subscriber_destroy
+    lib.cuframes_subscriber_destroy.argtypes = [c_void_p]
+    lib.cuframes_subscriber_destroy.restype = c_int
+
+    # cuframes_frame_* accessors
+    lib.cuframes_frame_cuda_ptr.argtypes = [c_void_p]
+    lib.cuframes_frame_cuda_ptr.restype = c_void_p
+
+    lib.cuframes_frame_size.argtypes = [c_void_p, POINTER(c_int32), POINTER(c_int32)]
+    lib.cuframes_frame_size.restype = None
+
+    lib.cuframes_frame_pitch_y.argtypes = [c_void_p]
+    lib.cuframes_frame_pitch_y.restype = c_int32
+
+    lib.cuframes_frame_pitch_uv.argtypes = [c_void_p]
+    lib.cuframes_frame_pitch_uv.restype = c_int32
+
+    lib.cuframes_frame_seq.argtypes = [c_void_p]
+    lib.cuframes_frame_seq.restype = c_uint64
+
+    lib.cuframes_frame_pts_ns.argtypes = [c_void_p]
+    lib.cuframes_frame_pts_ns.restype = c_int64
+
+    return lib
+
+
+# ─── Main consumer loop ────────────────────────────────────────────────
+
+def main():
+    ap = argparse.ArgumentParser(description="Reference cuframes Python consumer")
+    ap.add_argument("--key", required=True, help="publisher key (e.g. cam-parking)")
+    ap.add_argument("--max-frames", type=int, default=100, help="N frames to receive (default 100)")
+    ap.add_argument("--cuda-device", type=int, default=0)
+    ap.add_argument("--timeout-ms", type=int, default=1000, help="per-frame timeout")
+    args = ap.parse_args()
+
+    lib = _load_libcuframes()
+
+    # Configure subscriber
+    cfg = SubscriberConfig()
+    cfg.key = args.key.encode("utf-8")
+    cfg.consumer_name = None  # auto-generated
+    cfg.mode = CUFRAMES_MODE_NEWEST_ONLY
+    cfg.cuda_device = args.cuda_device
+    cfg.connect_timeout_ms = 5000
+
+    sub_handle = c_void_p()
+    rc = lib.cuframes_subscriber_create(ctypes.byref(cfg), ctypes.byref(sub_handle))
+    if rc != CUFRAMES_OK:
+        sys.stderr.write(f"subscribe failed: {lib.cuframes_strerror(rc).decode()}\n")
+        sys.exit(1)
+
+    print(f"[consumer] connected to '{args.key}'")
+
+    received = 0
+    first_pts = None
+    start_wall = None
+
+    try:
+        while received < args.max_frames:
+            frame_handle = c_void_p()
+            rc = lib.cuframes_subscriber_next(sub_handle, None, ctypes.byref(frame_handle),
+                                              args.timeout_ms)
+
+            if rc == CUFRAMES_ERR_TIMEOUT or rc == CUFRAMES_ERR_WOULD_BLOCK:
+                continue
+            if rc == CUFRAMES_ERR_DISCONNECTED:
+                print(f"[consumer] publisher disconnected — exit")
+                break
+            if rc != CUFRAMES_OK or not frame_handle.value:
+                sys.stderr.write(f"next failed: {lib.cuframes_strerror(rc).decode()}\n")
+                break
+
+            # Frame metadata
+            w, h = c_int32(0), c_int32(0)
+            lib.cuframes_frame_size(frame_handle, ctypes.byref(w), ctypes.byref(h))
+            pitch_y = lib.cuframes_frame_pitch_y(frame_handle)
+            pitch_uv = lib.cuframes_frame_pitch_uv(frame_handle)
+            cuda_ptr = lib.cuframes_frame_cuda_ptr(frame_handle)
+            seq = lib.cuframes_frame_seq(frame_handle)
+            pts_ns = lib.cuframes_frame_pts_ns(frame_handle)
+
+            if first_pts is None:
+                first_pts = pts_ns
+                start_wall = time.monotonic()
+                print(f"[consumer] first frame: {w.value}x{h.value} NV12, "
+                      f"pitch_y={pitch_y}, pitch_uv={pitch_uv}, cuda_ptr=0x{cuda_ptr:x}")
+
+            # ─── ВАШ ML PIPELINE ЗДЕСЬ ────────────────────────────
+            # 1. cudaMemcpy NV12 frame → host (или используй pycuda / cupy для in-GPU pipeline)
+            # 2. NV12 → RGB conversion (CPU либо GPU)
+            # 3. inference: model(frame) → results
+            # 4. publish results (mqtt / API / etc)
+            #
+            # В этом skeleton — просто counter.
+            received += 1
+            if received % 25 == 0:
+                print(f"[consumer] received={received} seq={seq} pts_ms={pts_ns // 1_000_000}")
+
+            # CRITICAL: release frame ОБЯЗАТЕЛЬНО — иначе publisher застрянет
+            # (или drop new frames при ring overflow в STRICT_ORDER mode).
+            lib.cuframes_subscriber_release(sub_handle, frame_handle)
+
+    finally:
+        lib.cuframes_subscriber_destroy(sub_handle)
+
+    if received > 1 and start_wall:
+        elapsed = time.monotonic() - start_wall
+        fps = (received - 1) / elapsed if elapsed > 0 else 0
+        print(f"\n=== RESULT ===")
+        print(f"received: {received} / {args.max_frames}")
+        print(f"elapsed:  {elapsed:.2f}s")
+        print(f"avg_fps:  {fps:.2f}")
+    sys.exit(0 if received >= args.max_frames else 1)
+
+
+if __name__ == "__main__":
+    main()
@@ -36,7 +36,7 @@ extern "C" {
 /* ─────────────────────────────────────────────────────────────────────── */

 #define CUFRAMES_VERSION_MAJOR 0
-#define CUFRAMES_VERSION_MINOR 1
+#define CUFRAMES_VERSION_MINOR 4
 #define CUFRAMES_VERSION_PATCH 0

 /** @brief Runtime-версия библиотеки в формате "MAJOR.MINOR.PATCH". */
@@ -65,6 +65,11 @@ typedef enum cuframes_error {
                                                 несовпадение размеров frame'а */
    CUFRAMES_ERR_WOULD_BLOCK     =  -11,  /**< non-blocking call — no data yet */
    CUFRAMES_ERR_TOO_MANY        =  -12,  /**< превышен MAX_SUBSCRIBERS (32) */
+    /* v0.2 — packet ring (см. docs/protocol.md §10.15) */
+    CUFRAMES_ERR_PACKET_OVERSIZED  =  -20,  /**< publish_packet size > max_packet_size */
+    CUFRAMES_ERR_NO_PACKET_RING    =  -21,  /**< subscriber запросил packets, у publisher'а нет ring'а */
+    CUFRAMES_ERR_NO_CODEC_PARAMS   =  -22,  /**< extradata ещё не set publisher'ом */
+    CUFRAMES_ERR_PACKET_OVERRUN    =  -23,  /**< slow subscriber, packet seq уехал — resync на keyframe */
    CUFRAMES_ERR_INTERNAL        = -100,  /**< bug в библиотеке — repro и reportить */
 } cuframes_error_t;

@@ -366,6 +371,142 @@ int cuframes_async_subscriber_create(const cuframes_subscriber_config_t *cfg,
 */
 int cuframes_async_subscriber_destroy(cuframes_async_subscriber_t *sub);

+/* ─────────────────────────────────────────────────────────────────────── */
+/* Encoded packet ring API (v0.2 — см. docs/protocol.md §10)                */
+/* ─────────────────────────────────────────────────────────────────────── */
+
+/** Packet flags — биты соответствуют AV_PKT_FLAG_* у FFmpeg. */
+#define CUFRAMES_PKT_FLAG_KEY            0x01u  /**< IDR / keyframe */
+#define CUFRAMES_PKT_FLAG_CORRUPT        0x02u  /**< RTP loss / damage */
+#define CUFRAMES_PKT_FLAG_DISCONTINUITY  0x04u  /**< gap before this packet */
+#define CUFRAMES_PKT_FLAG_LAST_IN_AU     0x08u  /**< последний NAL в access unit */
+
+/* Sizing and codec options for the encoded packet ring; passed to
+ * cuframes_publisher_enable_packets(). */
+typedef struct cuframes_packet_ring_options {
+    /** Number of slots in the ring index. Default 64 (≈ 2 sec @ 30fps + GOP). */
+    uint32_t ring_slots;
+    /** Size of the ring's data section in bytes. Default 8 MiB. */
+    uint32_t data_size;
+    /** Sanity guard — the publisher rejects any packet larger than this. Default 2 MiB. */
+    uint32_t max_packet_size;
+    /** FFmpeg AV_CODEC_ID_* (H.264 = 27, HEVC = 173). */
+    uint32_t codec_id;
+    /* NOTE(review): presumably reserved for future extension — confirm
+     * whether callers are required to zero it. */
+    uint64_t _reserved[4];
+} cuframes_packet_ring_options_t;
+
+/**
+ * @brief Активировать encoded packet ring на существующем publisher'е.
+ *
+ * Создаёт дополнительный SHM `/dev/shm/cuframes-<key>-packets`. После
+ * этого call'а publisher шлёт packets через `cuframes_publisher_publish_packet`.
+ *
+ * Должно быть вызвано **до** первого `publish_packet` и желательно до того
+ * как subscribers начнут подключаться (иначе они увидят publisher без packet
+ * ring и не получат packets).
+ *
+ * @param pub
+ * @param opts  NULL = default sizing (64 slots, 8MiB data, 2MiB max). codec_id=0 = unknown.
+ * @return CUFRAMES_ERR_ALREADY_EXISTS если ring уже активирован
+ */
+int cuframes_publisher_enable_packets(cuframes_publisher_t *pub,
+                                       const cuframes_packet_ring_options_t *opts);
+
+/**
+ * @brief Установить codec extradata (SPS/PPS/VPS) для packet ring.
+ *
+ * Subscribers (FFmpeg demuxer) читают extradata из shared header и подставляют
+ * в AVCodecContext.extradata. Должно быть вызвано до того как subscribers
+ * захотят decode.
+ *
+ * @param size  ≤ 4096 байт (CUFRAMES_PKT_EXTRADATA_MAX)
+ */
+int cuframes_publisher_set_codec_extradata(cuframes_publisher_t *pub,
+                                            const void *extradata, size_t size);
+
+/**
+ * @brief Опубликовать encoded packet (H.264/H.265 NAL units, Annex B).
+ *
+ * Slow consumer = overwrite oldest. Late subscriber resync'нется на last
+ * keyframe (см. docs/protocol.md §10.14).
+ *
+ * @param flags  CUFRAMES_PKT_FLAG_* (минимум KEY на IDR — критично!)
+ * @return CUFRAMES_ERR_NO_PACKET_RING если не вызывали enable_packets
+ * @return CUFRAMES_ERR_PACKET_OVERSIZED если size > max_packet_size
+ */
+int cuframes_publisher_publish_packet(cuframes_publisher_t *pub,
+                                       const void *data, size_t size,
+                                       int64_t pts_ns, int64_t dts_ns,
+                                       uint32_t flags);
+
+/* ── Subscriber-side packet API ───────────────────────────────────────── */
+
+/** Opaque packet handle. Освобождается через release_packet. */
+typedef struct cuframes_packet cuframes_packet_t;
+
+/** @brief Pointer на encoded NAL bytes. Valid до release_packet. */
+const void *cuframes_packet_data(const cuframes_packet_t *p);
+
+/** @brief Размер payload в байтах. */
+size_t cuframes_packet_size(const cuframes_packet_t *p);
+
+/** @brief Presentation timestamp (наносекунды). */
+int64_t cuframes_packet_pts(const cuframes_packet_t *p);
+
+/** @brief Decode timestamp (для B-frames pipelines). */
+int64_t cuframes_packet_dts(const cuframes_packet_t *p);
+
+/** @brief Биты CUFRAMES_PKT_FLAG_*. */
+uint32_t cuframes_packet_flags(const cuframes_packet_t *p);
+
+/** @brief Sequence number у publisher'а. */
+uint64_t cuframes_packet_seq(const cuframes_packet_t *p);
+
+/**
+ * @brief Активировать чтение packet ring на subscriber'е.
+ *
+ * Открывает SHM `/dev/shm/cuframes-<key>-packets` (тот же `key` что в config).
+ * После этого можно читать через `cuframes_subscriber_next_packet`.
+ *
+ * Subscriber может одновременно иметь frames ring и packets ring (или один из).
+ *
+ * @return CUFRAMES_ERR_NOT_FOUND если publisher не имеет packet ring
+ */
+int cuframes_subscriber_enable_packets(cuframes_subscriber_t *sub);
+
+/**
+ * @brief Получить следующий packet.
+ *
+ * Late subscriber (первый вызов) начинает с last_keyframe_seq publisher'а —
+ * decoder receive'нет valid stream без glitches.
+ *
+ * Полученный packet ОБЯЗАТЕЛЬНО освободить через
+ * cuframes_subscriber_release_packet().
+ *
+ * @param timeout_ms <0 = блокироваться, 0 = non-blocking (WOULD_BLOCK), >0 = с таймаутом
+ * @return CUFRAMES_ERR_PACKET_OVERRUN — subscriber отстал, resync на keyframe (library сделает автоматически на next call)
+ * @return CUFRAMES_ERR_DISCONNECTED — publisher shutdown
+ */
+int cuframes_subscriber_next_packet(cuframes_subscriber_t *sub,
+                                     cuframes_packet_t **pkt_out,
+                                     int32_t timeout_ms);
+
+/** @brief Освободить packet handle. NULL-safe. */
+int cuframes_subscriber_release_packet(cuframes_subscriber_t *sub,
+                                        cuframes_packet_t *pkt);
+
+/**
+ * @brief Получить codec parameters publisher'а.
+ *
+ * `*extradata_out` — pointer в библиотечный buffer, valid пока subscriber жив.
+ * Caller должен скопировать данные если хочет hold past subscriber lifetime.
+ *
+ * @return CUFRAMES_ERR_NO_CODEC_PARAMS если publisher ещё не вызвал
+ *         set_codec_extradata
+ */
+int cuframes_subscriber_get_codec_params(cuframes_subscriber_t *sub,
+                                          uint32_t *codec_id_out,
+                                          const void **extradata_out,
+                                          size_t *extradata_size_out);
+
 /* ─────────────────────────────────────────────────────────────────────── */
 /* Утилиты                                                                 */
 /* ─────────────────────────────────────────────────────────────────────── */
@@ -148,6 +148,23 @@ public:
              "Publisher::publish_external");
    }

+    /* v0.2 — encoded packet ring */
+    /* Turn on the encoded packet ring for this publisher; `opts` is forwarded
+     * unchanged to the C API, and the status code is handed to check(). */
+    void enable_packets(const cuframes_packet_ring_options_t *opts = nullptr) {
+        const int status = cuframes_publisher_enable_packets(pub_, opts);
+        check(status, "Publisher::enable_packets");
+    }
+
+    /* Forward `size` bytes of codec extradata to the C API and hand the
+     * status code to check(). */
+    void set_codec_extradata(const void *data, size_t size) {
+        const int status = cuframes_publisher_set_codec_extradata(pub_, data, size);
+        check(status, "Publisher::set_codec_extradata");
+    }
+
+    /* Publish one encoded packet. Never throws: the C API result
+     * (CUFRAMES_OK or a negative error code) is returned as-is so the
+     * caller decides how to react. */
+    int publish_packet(const void *data, size_t size,
+                        int64_t pts_ns, int64_t dts_ns, uint32_t flags) noexcept {
+        const int status =
+            cuframes_publisher_publish_packet(pub_, data, size, pts_ns, dts_ns, flags);
+        return status;
+    }
+
    cuframes_publisher_t *raw() noexcept { return pub_; }

 private:
@@ -10,6 +10,7 @@ set(CUFRAMES_SOURCES
    src/producer.c
    src/consumer.c
    src/consumer_async.c
+    src/packet_ring.c
 )

 add_library(cuframes SHARED ${CUFRAMES_SOURCES})
@@ -18,7 +19,7 @@ add_library(cuframes_static STATIC ${CUFRAMES_SOURCES})
 foreach(target cuframes cuframes_static)
    target_include_directories(${target}
        PUBLIC
-            $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
+            $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
            $<INSTALL_INTERFACE:include>
        PRIVATE
            ${CMAKE_CURRENT_SOURCE_DIR}/src
@@ -33,6 +34,7 @@ foreach(target cuframes cuframes_static)
    target_link_libraries(${target}
        PUBLIC
            CUDA::cudart
+            CUDA::cuda_driver   # v0.4 — cuMemCreate/cuMemMap/cuMemExportToShareableHandle
            Threads::Threads
            rt   # для shm_open
    )
@@ -40,7 +42,7 @@ endforeach()

 # Set SOVERSION на shared lib для ABI tracking
 set_target_properties(cuframes PROPERTIES
-    VERSION 0.1.0
+    VERSION 0.4.0
    SOVERSION 0
 )

@@ -1,4 +1,13 @@
-/* Subscriber implementation (sync). */
+/* Subscriber implementation (sync).
+ *
+ * v0.4 — VMM + POSIX FD. Принимает FDs через SCM_RIGHTS в handshake,
+ * импортирует через cuMemImportFromShareableHandle + cuMemMap. Не требует
+ * shared pid/ipc namespace с producer'ом.
+ *
+ * Sync: producer cuStreamSynchronize'ит свой stream перед atomic_store(seq).
+ * Consumer просто читает seq (acquire) и копирует данные через DtoD memcpy —
+ * никаких cudaEventWait не нужно (HW coherence на одном GPU).
+ */

 #include "internal.h"
 #include <errno.h>
@@ -6,6 +15,7 @@
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <time.h>
 #include <unistd.h>

 /* Opaque frame — выдаётся subscriber'у на next() */
@@ -20,7 +30,17 @@ struct cuframes_frame {
    int64_t pts_ns;

    uint32_t slot_idx;
-    void *subscriber;        /* back-ref для release() */
+    void *subscriber;
+};
+
+/* Definition behind the opaque cuframes_packet_t handle. */
+struct cuframes_packet {
+    uint8_t *data;      /* packet payload bytes */
+    size_t   capacity;  /* NOTE(review): presumably the allocated size of `data` — confirm */
+    size_t   size;      /* NOTE(review): presumably the valid payload length in bytes — confirm */
+    int64_t  pts_ns;    /* presentation timestamp, nanoseconds */
+    int64_t  dts_ns;    /* decode timestamp, nanoseconds */
+    uint32_t flags;     /* NOTE(review): presumably CUFRAMES_PKT_FLAG_* bits — confirm */
+    uint64_t seq;       /* NOTE(review): presumably the publisher's sequence number — confirm */
+};

 struct cuframes_subscriber {
@@ -32,18 +52,31 @@ struct cuframes_subscriber {
    cuframes_shm_header_t *hdr;
    char shm_name[80];

-    cudaEvent_t producer_event;
-    void *mapped_ptrs[CUFRAMES_MAX_RING];
+    /* v0.4 — VMM imported slots */
+    CUmemGenericAllocationHandle vmm_handles[CUFRAMES_MAX_RING];
+    CUdeviceptr                  vmm_ptrs[CUFRAMES_MAX_RING];
+    size_t                       vmm_slot_size;
+    int                          imported_count;

    uint32_t assigned_bit;
    uint64_t last_seen_seq;

-    /* Frame pool — переиспользуем одну frame_t structure (single-thread API).
-     * Опционально расширим до lock-free pool в v0.2 если нужен multi-frame. */
    struct cuframes_frame frame_obj;
    int frame_busy;
+
+    int                 has_pkt_ring;
+    cuframes_pkt_ring_t pkt_ring;
+    uint64_t            last_packet_seq;
+    struct cuframes_packet packet_obj;
+    int                 packet_busy;
 };

+/* Text for a CUDA driver result code, for log messages.
+ * Falls back to "?" when the driver leaves the string pointer NULL. */
+static const char *cu_err_str(CUresult r) {
+    const char *text = NULL;
+    (void)cuGetErrorString(r, &text);
+    if (text == NULL) {
+        return "?";
+    }
+    return text;
+}
+
 /* ─── Frame accessors ────────────────────────────────────────────────── */
 void *cuframes_frame_cuda_ptr(const cuframes_frame_t *f) { return f ? f->cuda_ptr : NULL; }
 cuframes_format_t cuframes_frame_format(const cuframes_frame_t *f) { return f ? f->format : 0; }
@@ -59,11 +92,13 @@ int64_t cuframes_frame_pts_ns(const cuframes_frame_t *f) { return f ? f->pts_ns

 /* ─── Subscriber create ──────────────────────────────────────────────── */

-static int do_handshake(struct cuframes_subscriber *sub, const char *name) {
-    /* Send HELLO_REQ */
+static int do_handshake(struct cuframes_subscriber *sub, const char *name,
+                         int *fds_out, uint32_t *fd_count_inout,
+                         uint64_t *slot_size_out) {
+    /* Send HELLO_REQ — proto v4 */
    uint8_t buf[CUFRAMES_MAX_MSG_PAYLOAD];
    cuframes_msg_hello_req_t *hreq = (cuframes_msg_hello_req_t *)buf;
-    hreq->proto_version = CUFRAMES_PROTOCOL_V1;
+    hreq->proto_version = CUFRAMES_PROTOCOL_V4;
    uint32_t nl = name ? (uint32_t)strlen(name) : 0;
    if (nl > 31) nl = 31;
    hreq->consumer_name_len = nl;
@@ -80,7 +115,6 @@ static int do_handshake(struct cuframes_subscriber *sub, const char *name) {
                                         buf, plen);
    if (r != CUFRAMES_OK) return r;

-    /* Recv HELLO_RESP */
    uint32_t rmt = 0, rpl = sizeof(buf);
    r = cuframes_internal_recv_msg(sub->sock_fd, &rmt, buf, &rpl, 5000);
    if (r != CUFRAMES_OK) return r;
@@ -88,10 +122,15 @@ static int do_handshake(struct cuframes_subscriber *sub, const char *name) {

    cuframes_msg_hello_resp_t *hresp = (cuframes_msg_hello_resp_t *)buf;
    if (hresp->result != CUFRAMES_OK) return hresp->result;
+    if (hresp->proto_version_actual != CUFRAMES_PROTOCOL_V4) {
+        CUFRAMES_LOG_ERROR("publisher proto v%u — нужен v%u (v0.4)",
+                            hresp->proto_version_actual, CUFRAMES_PROTOCOL_V4);
+        return CUFRAMES_ERR_PROTOCOL;
+    }

    /* Send SUBSCRIBE_REQ */
    uint32_t srbuf[8];
-    srbuf[0] = CUFRAMES_PROTOCOL_V1;
+    srbuf[0] = CUFRAMES_PROTOCOL_V4;
    memset(srbuf + 1, 0, 28);
    r = cuframes_internal_send_msg(sub->sock_fd, CUFRAMES_MSG_SUBSCRIBE_REQ,
                                     srbuf, sizeof(srbuf));
@@ -106,7 +145,29 @@ static int do_handshake(struct cuframes_subscriber *sub, const char *name) {
    if (sresp.result != CUFRAMES_OK) return sresp.result;

    sub->assigned_bit = sresp.assigned_bit;
-    sub->last_seen_seq = sresp.initial_seq;  /* start от текущей точки */
+    sub->last_seen_seq = sresp.initial_seq;
+
+    /* Recv VMM_FDS */
+    cuframes_msg_vmm_fds_t vmm_payload = {0};
+    uint32_t vmm_plen = sizeof(vmm_payload);
+    rmt = 0;
+    r = cuframes_internal_recv_msg_with_fds(sub->sock_fd, &rmt,
+                                              &vmm_payload, &vmm_plen,
+                                              fds_out, fd_count_inout, 5000);
+    if (r != CUFRAMES_OK) {
+        CUFRAMES_LOG_ERROR("recv VMM_FDS: %s", cuframes_strerror(r));
+        return r;
+    }
+    if (rmt != CUFRAMES_MSG_VMM_FDS) {
+        CUFRAMES_LOG_ERROR("expected VMM_FDS got 0x%x", rmt);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+    if (vmm_payload.fd_count != *fd_count_inout) {
+        CUFRAMES_LOG_ERROR("VMM_FDS: payload fd_count=%u, received %u",
+                            vmm_payload.fd_count, *fd_count_inout);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+    *slot_size_out = vmm_payload.slot_size_bytes;
    return CUFRAMES_OK;
 }

@@ -123,7 +184,6 @@ int cuframes_subscriber_create(const cuframes_subscriber_config_t *cfg,
    sub->sock_fd = -1;
    sub->shm_fd = -1;

-    /* Generate fallback name if NULL */
    char name_buf[32];
    const char *name = cfg->consumer_name;
    if (!name) {
@@ -132,12 +192,10 @@ int cuframes_subscriber_create(const cuframes_subscriber_config_t *cfg,
        name = name_buf;
    }

-    /* Build paths */
    char sock_path[128];
    int r = cuframes_internal_socket_path(cfg->key, sock_path, sizeof(sock_path));
    if (r != CUFRAMES_OK) { free(sub); return r; }

-    /* Connect with timeout retry */
    int64_t deadline = cfg->connect_timeout_ms > 0
        ? cuframes_now_ns() + (int64_t)cfg->connect_timeout_ms * 1000000LL
        : 0;
@@ -152,63 +210,117 @@ int cuframes_subscriber_create(const cuframes_subscriber_config_t *cfg,
        sub->sock_fd = -1;
        if (cfg->connect_timeout_ms == 0) { r = CUFRAMES_ERR_NOT_FOUND; goto fail; }
        if (deadline && cuframes_now_ns() > deadline) { r = CUFRAMES_ERR_TIMEOUT; goto fail; }
-        struct timespec ts = {.tv_sec = 0, .tv_nsec = 100000000}; /* 100ms */
+        struct timespec ts = {.tv_sec = 0, .tv_nsec = 100000000};
        nanosleep(&ts, NULL);
    }

-    /* Handshake */
-    r = do_handshake(sub, name);
+    /* Handshake (включая VMM_FDS) */
+    int fds[CUFRAMES_MAX_RING];
+    for (int i = 0; i < CUFRAMES_MAX_RING; i++) fds[i] = -1;
+    uint32_t fd_count = CUFRAMES_MAX_RING;
+    uint64_t slot_size = 0;
+    r = do_handshake(sub, name, fds, &fd_count, &slot_size);
    if (r != CUFRAMES_OK) goto fail;

-    /* Open SHM */
+    /* Open SHM (для seq atomics + meta) */
    r = cuframes_internal_shm_name(cfg->key, sub->shm_name, sizeof(sub->shm_name));
-    if (r != CUFRAMES_OK) goto fail;
+    if (r != CUFRAMES_OK) goto fail_close_fds;
    sub->shm_fd = shm_open(sub->shm_name, O_RDWR, 0);
    if (sub->shm_fd < 0) {
        CUFRAMES_LOG_ERROR("shm_open %s: %s", sub->shm_name, strerror(errno));
-        r = CUFRAMES_ERR_IO; goto fail;
+        r = CUFRAMES_ERR_IO; goto fail_close_fds;
    }
    sub->hdr = mmap(NULL, sizeof(cuframes_shm_header_t),
                     PROT_READ | PROT_WRITE, MAP_SHARED, sub->shm_fd, 0);
    if (sub->hdr == MAP_FAILED) {
        sub->hdr = NULL;
-        r = CUFRAMES_ERR_IO; goto fail;
+        r = CUFRAMES_ERR_IO; goto fail_close_fds;
+    }
+    if (sub->hdr->magic != CUFRAMES_MAGIC) {
+        if (sub->hdr->magic == CUFRAMES_MAGIC_LEGACY) {
+            CUFRAMES_LOG_ERROR("publisher uses legacy v0.1-v0.3 SHM — нужен v0.4 publisher");
+        } else {
+            CUFRAMES_LOG_ERROR("SHM magic mismatch: 0x%x", sub->hdr->magic);
+        }
+        r = CUFRAMES_ERR_PROTOCOL; goto fail_close_fds;
+    }
+    if (sub->hdr->proto_version != CUFRAMES_PROTOCOL_V4) {
+        CUFRAMES_LOG_ERROR("SHM proto v%u — нужен v%u",
+                            sub->hdr->proto_version, CUFRAMES_PROTOCOL_V4);
+        r = CUFRAMES_ERR_PROTOCOL; goto fail_close_fds;
    }
-    if (sub->hdr->magic != CUFRAMES_MAGIC) { r = CUFRAMES_ERR_PROTOCOL; goto fail; }

-    /* CUDA setup */
+    /* CUDA driver init + import VMM handles */
+    CUresult cr = cuInit(0);
+    if (cr != CUDA_SUCCESS) {
+        CUFRAMES_LOG_ERROR("cuInit: %s", cu_err_str(cr));
+        r = CUFRAMES_ERR_CUDA; goto fail_close_fds;
+    }
+    /* Ensure a runtime context exists (cudaMemcpyAsync from this pool needs it) */
    cudaError_t cerr = cudaSetDevice(sub->cfg.cuda_device);
    if (cerr != cudaSuccess) {
        CUFRAMES_LOG_ERROR("cudaSetDevice: %s", cudaGetErrorString(cerr));
-        r = CUFRAMES_ERR_CUDA; goto fail;
+        r = CUFRAMES_ERR_CUDA; goto fail_close_fds;
    }

-    /* Open producer's event */
-    cerr = cudaIpcOpenEventHandle(&sub->producer_event, sub->hdr->ipc_event_handle);
-    if (cerr != cudaSuccess) {
-        CUFRAMES_LOG_ERROR("cudaIpcOpenEventHandle: %s", cudaGetErrorString(cerr));
-        r = CUFRAMES_ERR_CUDA; goto fail;
-    }
+    CUmemAccessDesc access = {0};
+    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access.location.id = sub->cfg.cuda_device;
+    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

-    /* Open mem handles */
-    int ring = (int)sub->hdr->ring_size;
-    if (ring > CUFRAMES_MAX_RING) ring = CUFRAMES_MAX_RING;
-    for (int i = 0; i < ring; ++i) {
-        cerr = cudaIpcOpenMemHandle(&sub->mapped_ptrs[i],
-                                       sub->hdr->slots[i].mem_handle,
-                                       cudaIpcMemLazyEnablePeerAccess);
-        if (cerr != cudaSuccess) {
-            CUFRAMES_LOG_ERROR("cudaIpcOpenMemHandle slot %d: %s",
-                                i, cudaGetErrorString(cerr));
-            r = CUFRAMES_ERR_CUDA; goto fail;
+    sub->vmm_slot_size = (size_t)slot_size;
+    sub->imported_count = 0;
+    for (uint32_t i = 0; i < fd_count; ++i) {
+        cr = cuMemImportFromShareableHandle(&sub->vmm_handles[i],
+                                             (void *)(uintptr_t)fds[i],
+                                             CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemImportFromShareableHandle slot %u: %s",
+                                i, cu_err_str(cr));
+            r = CUFRAMES_ERR_CUDA; goto fail_unmap;
        }
+        /* После import можно закрыть FD — kernel держит reference через handle */
+        close(fds[i]);
+        fds[i] = -1;
+
+        cr = cuMemAddressReserve(&sub->vmm_ptrs[i], sub->vmm_slot_size, 0, 0, 0);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemAddressReserve slot %u: %s",
+                                i, cu_err_str(cr));
+            r = CUFRAMES_ERR_CUDA; goto fail_unmap;
+        }
+        cr = cuMemMap(sub->vmm_ptrs[i], sub->vmm_slot_size, 0,
+                       sub->vmm_handles[i], 0);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemMap slot %u: %s", i, cu_err_str(cr));
+            r = CUFRAMES_ERR_CUDA; goto fail_unmap;
+        }
+        cr = cuMemSetAccess(sub->vmm_ptrs[i], sub->vmm_slot_size, &access, 1);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemSetAccess slot %u: %s", i, cu_err_str(cr));
+            r = CUFRAMES_ERR_CUDA; goto fail_unmap;
+        }
+        sub->imported_count++;
    }

-    CUFRAMES_LOG_INFO("subscriber '%s' connected to '%s' (bit=%u, ring=%d)",
-                       name, sub->key, sub->assigned_bit, ring);
+    CUFRAMES_LOG_INFO("subscriber '%s' connected to '%s' (bit=%u, ring=%u, v0.4 VMM)",
+                       name, sub->key, sub->assigned_bit, fd_count);
    *out = sub;
    return CUFRAMES_OK;

+fail_unmap:
+    /* Cleanup partial VMM */
+    for (int i = 0; i < sub->imported_count; i++) {
+        if (sub->vmm_ptrs[i]) {
+            cuMemUnmap(sub->vmm_ptrs[i], sub->vmm_slot_size);
+            cuMemAddressFree(sub->vmm_ptrs[i], sub->vmm_slot_size);
+        }
+        if (sub->vmm_handles[i]) cuMemRelease(sub->vmm_handles[i]);
+    }
+fail_close_fds:
+    for (int i = 0; i < CUFRAMES_MAX_RING; i++) {
+        if (fds[i] >= 0) close(fds[i]);
+    }
 fail:
    cuframes_subscriber_destroy(sub);
    return r;
@@ -224,6 +336,7 @@ int cuframes_subscriber_next(cuframes_subscriber_t *sub,
                              memory_order_acquire) != 0) {
        return CUFRAMES_ERR_DISCONNECTED;
    }
+    (void)consumer_stream;  /* v0.4: producer уже StreamSync'нул, sync не нужен */

    int64_t deadline = (timeout_ms > 0)
        ? cuframes_now_ns() + (int64_t)timeout_ms * 1000000LL
@@ -237,11 +350,9 @@ int cuframes_subscriber_next(cuframes_subscriber_t *sub,
            if (sub->cfg.mode == CUFRAMES_MODE_NEWEST_ONLY) {
                target_seq = gs;
            } else {
-                /* STRICT_ORDER */
                if (sub->last_seen_seq == UINT64_MAX) {
                    target_seq = gs;
                } else if (gs > sub->last_seen_seq + (uint64_t)sub->hdr->ring_size) {
-                    /* Producer overran us. */
                    return CUFRAMES_ERR_DISCONNECTED;
                } else {
                    target_seq = sub->last_seen_seq + 1;
@@ -251,30 +362,22 @@ int cuframes_subscriber_next(cuframes_subscriber_t *sub,
            uint64_t slot_seq = atomic_load_explicit(&sub->hdr->slots[slot_idx].seq,
                                                     memory_order_acquire);
            if (slot_seq != target_seq) {
-                /* Slot уже перезаписан producer'ом — пересчитать */
                continue;
            }
            int64_t pts = atomic_load_explicit(&sub->hdr->slots[slot_idx].pts_ns,
                                                memory_order_acquire);

-            /* Cross-process sync: wait event on consumer's stream */
-            if (consumer_stream) {
-                cudaError_t cerr = cudaStreamWaitEvent((cudaStream_t)consumer_stream,
-                                                         sub->producer_event, 0);
-                if (cerr != cudaSuccess) {
-                    CUFRAMES_LOG_WARN("cudaStreamWaitEvent: %s",
-                                       cudaGetErrorString(cerr));
-                    return CUFRAMES_ERR_CUDA;
-                }
-            } else {
-                /* Synchronize globally — для cudaMemcpyDeviceToHost users */
-                cudaError_t cerr = cudaEventSynchronize(sub->producer_event);
-                if (cerr != cudaSuccess) return CUFRAMES_ERR_CUDA;
+            /* v0.4: producer уже cuStreamSynchronize'нул перед atomic_store seq.
+             * Данные физически в GPU memory к моменту acquire fence. Post-sync
+             * verify оставляем — defending against ring wrap pока мы читали pts. */
+            uint64_t verify_seq = atomic_load_explicit(&sub->hdr->slots[slot_idx].seq,
+                                                       memory_order_acquire);
+            if (verify_seq != target_seq) {
+                continue;
            }

-            /* Fill frame_out */
            struct cuframes_frame *f = &sub->frame_obj;
-            f->cuda_ptr = sub->mapped_ptrs[slot_idx];
+            f->cuda_ptr = (void *)(uintptr_t)sub->vmm_ptrs[slot_idx];
            f->format = (cuframes_format_t)sub->hdr->meta.format;
            f->width = sub->hdr->meta.width;
            f->height = sub->hdr->meta.height;
@@ -290,12 +393,9 @@ int cuframes_subscriber_next(cuframes_subscriber_t *sub,
            return CUFRAMES_OK;
        }

-        /* Не было frame'ов */
        if (timeout_ms == 0) return CUFRAMES_ERR_WOULD_BLOCK;
        if (timeout_ms > 0 && cuframes_now_ns() > deadline) return CUFRAMES_ERR_TIMEOUT;

-        /* Poll-based wait (eventfd — v0.2). 50µs interval — компромисс
-         * latency vs CPU. */
        struct timespec ts = {.tv_sec = 0, .tv_nsec = 50000};
        nanosleep(&ts, NULL);

@@ -311,7 +411,6 @@ int cuframes_subscriber_release(cuframes_subscriber_t *sub,
    if (!frame) return CUFRAMES_OK;
    if (!sub || frame->subscriber != sub) return CUFRAMES_ERR_INVALID_ARG;

-    /* ACK через bitmap */
    if (sub->assigned_bit > 0 && sub->assigned_bit < 64) {
        atomic_fetch_or_explicit(&sub->hdr->slots[frame->slot_idx].ack_bitmap,
                                  1ULL << sub->assigned_bit,
@@ -330,7 +429,6 @@ int cuframes_subscriber_release(cuframes_subscriber_t *sub,
 int cuframes_subscriber_destroy(cuframes_subscriber_t *sub) {
    if (!sub) return CUFRAMES_OK;

-    /* Clear subscriber bit */
    if (sub->hdr && sub->assigned_bit > 0) {
        atomic_fetch_and_explicit(&sub->hdr->subscriber_bitmap,
                                   ~(1ULL << sub->assigned_bit),
@@ -339,12 +437,21 @@ int cuframes_subscriber_destroy(cuframes_subscriber_t *sub) {
                              0, memory_order_release);
    }

-    if (sub->producer_event) cudaEventDestroy(sub->producer_event);
+    /* VMM cleanup */
+    for (int i = 0; i < sub->imported_count; i++) {
+        if (sub->vmm_ptrs[i]) {
+            cuMemUnmap(sub->vmm_ptrs[i], sub->vmm_slot_size);
+            cuMemAddressFree(sub->vmm_ptrs[i], sub->vmm_slot_size);
+        }
+        if (sub->vmm_handles[i]) cuMemRelease(sub->vmm_handles[i]);
+    }

-    int ring = sub->hdr ? (int)sub->hdr->ring_size : 0;
-    if (ring > CUFRAMES_MAX_RING) ring = CUFRAMES_MAX_RING;
-    for (int i = 0; i < ring; ++i) {
-        if (sub->mapped_ptrs[i]) cudaIpcCloseMemHandle(sub->mapped_ptrs[i]);
+    if (sub->has_pkt_ring) {
+        cuframes_internal_pkt_ring_destroy(&sub->pkt_ring);
+    }
+    if (sub->packet_obj.data) {
+        free(sub->packet_obj.data);
+        sub->packet_obj.data = NULL;
    }

    if (sub->hdr) munmap(sub->hdr, sizeof(cuframes_shm_header_t));
@@ -353,3 +460,120 @@ int cuframes_subscriber_destroy(cuframes_subscriber_t *sub) {
    free(sub);
    return CUFRAMES_OK;
 }
+
+/* ─────────────────────────────────────────────────────────────────────── */
+/* v0.2 — encoded packet ring API (see docs/protocol.md §10)               */
+/* ─────────────────────────────────────────────────────────────────────── */
+
+/* Read-only accessors for a packet handle. Every accessor accepts p == NULL
+ * and then yields a neutral value (NULL pointer or 0). */
+const void *cuframes_packet_data(const cuframes_packet_t *p) {
+    if (!p) return NULL;
+    return p->data;
+}
+
+size_t cuframes_packet_size(const cuframes_packet_t *p) {
+    if (!p) return 0;
+    return p->size;
+}
+
+int64_t cuframes_packet_pts(const cuframes_packet_t *p) {
+    if (!p) return 0;
+    return p->pts_ns;
+}
+
+int64_t cuframes_packet_dts(const cuframes_packet_t *p) {
+    if (!p) return 0;
+    return p->dts_ns;
+}
+
+uint32_t cuframes_packet_flags(const cuframes_packet_t *p) {
+    if (!p) return 0;
+    return p->flags;
+}
+
+uint64_t cuframes_packet_seq(const cuframes_packet_t *p) {
+    if (!p) return 0;
+    return p->seq;
+}
+
+/* Attach the subscriber to the encoded-packet ring that belongs to its key.
+ *
+ * Opens the packet SHM, allocates a staging buffer as large as the ring's
+ * data area, and positions the read cursor just before the most recent
+ * keyframe (or leaves it at UINT64_MAX when no keyframe was published yet).
+ * Calling it again after a successful attach is a no-op.
+ *
+ * Returns CUFRAMES_OK, CUFRAMES_ERR_INVALID_ARG for a NULL subscriber,
+ * CUFRAMES_ERR_OUT_OF_MEMORY when the staging buffer cannot be allocated, or
+ * whatever the name/open helpers report. */
+int cuframes_subscriber_enable_packets(cuframes_subscriber_t *sub) {
+    if (!sub) return CUFRAMES_ERR_INVALID_ARG;
+    if (sub->has_pkt_ring) return CUFRAMES_OK;
+
+    char ring_shm_name[128];
+    int rc = cuframes_internal_pkt_shm_name(sub->key, ring_shm_name,
+                                             sizeof(ring_shm_name));
+    if (rc != CUFRAMES_OK) return rc;
+
+    rc = cuframes_internal_pkt_ring_open(ring_shm_name, &sub->pkt_ring);
+    if (rc != CUFRAMES_OK) return rc;
+
+    /* Staging buffer sized to the whole data area of the ring. */
+    const size_t buf_bytes = sub->pkt_ring.hdr->data_size;
+    sub->packet_obj.data = (uint8_t *)malloc(buf_bytes);
+    if (sub->packet_obj.data == NULL) {
+        cuframes_internal_pkt_ring_destroy(&sub->pkt_ring);
+        return CUFRAMES_ERR_OUT_OF_MEMORY;
+    }
+    sub->packet_obj.capacity = buf_bytes;
+
+    /* Start one step before the last keyframe so the first read returns it. */
+    const uint64_t last_key = atomic_load_explicit(
+        &sub->pkt_ring.hdr->last_keyframe_seq, memory_order_acquire);
+    if (last_key == UINT64_MAX) {
+        sub->last_packet_seq = UINT64_MAX;
+    } else {
+        sub->last_packet_seq = last_key - 1;
+    }
+
+    sub->has_pkt_ring = 1;
+    return CUFRAMES_OK;
+}
+
+/* Fetch the next encoded packet from the packet ring.
+ *
+ * sub        — subscriber with the packet ring enabled
+ *              (cuframes_subscriber_enable_packets).
+ * pkt_out    — receives &sub->packet_obj on success. It is set to NULL on
+ *              every failure produced by the polling loop (overrun, read
+ *              error, would-block, timeout).
+ * timeout_ms — 0: a single non-blocking attempt; > 0: poll until the
+ *              deadline; < 0: poll without a deadline.
+ *
+ * Returns CUFRAMES_OK, or:
+ *   CUFRAMES_ERR_INVALID_ARG    — NULL argument, or the previous packet has
+ *                                 not been released yet. *pkt_out is left
+ *                                 untouched in that case so the caller keeps
+ *                                 the pointer it still has to release;
+ *   CUFRAMES_ERR_NO_PACKET_RING — packets were never enabled;
+ *   CUFRAMES_ERR_PACKET_OVERRUN — the reader fell behind; the cursor is moved
+ *                                 to just before the last keyframe (if any);
+ *   CUFRAMES_ERR_WOULD_BLOCK / CUFRAMES_ERR_TIMEOUT — nothing new in time;
+ *   any other code reported by cuframes_internal_pkt_ring_read. */
+int cuframes_subscriber_next_packet(cuframes_subscriber_t *sub,
+                                     cuframes_packet_t **pkt_out,
+                                     int32_t timeout_ms) {
+    if (!sub || !pkt_out) return CUFRAMES_ERR_INVALID_ARG;
+    if (!sub->has_pkt_ring) return CUFRAMES_ERR_NO_PACKET_RING;
+    if (sub->packet_busy) return CUFRAMES_ERR_INVALID_ARG;
+
+    int64_t deadline_ns = (timeout_ms > 0) ?
+        cuframes_now_ns() + (int64_t)timeout_ms * 1000000LL : 0;
+
+    for (;;) {
+        size_t size = 0;
+        int64_t pts = 0, dts = 0;
+        uint32_t flags = 0;
+        uint64_t seq_attempt = sub->last_packet_seq;
+
+        int r = cuframes_internal_pkt_ring_read(&sub->pkt_ring,
+                                                 &seq_attempt,
+                                                 sub->packet_obj.data,
+                                                 sub->packet_obj.capacity,
+                                                 &size, &pts, &dts, &flags);
+        if (r == CUFRAMES_OK) {
+            sub->last_packet_seq      = seq_attempt;
+            sub->packet_obj.size      = size;
+            sub->packet_obj.pts_ns    = pts;
+            sub->packet_obj.dts_ns    = dts;
+            sub->packet_obj.flags     = flags;
+            sub->packet_obj.seq       = seq_attempt;
+            sub->packet_busy = 1;
+            *pkt_out = &sub->packet_obj;
+            return CUFRAMES_OK;
+        }
+
+        if (r == CUFRAMES_ERR_PACKET_OVERRUN) {
+            /* Resync: the next read starts at the most recent keyframe. */
+            uint64_t kf = atomic_load_explicit(
+                &sub->pkt_ring.hdr->last_keyframe_seq, memory_order_acquire);
+            if (kf != UINT64_MAX) {
+                sub->last_packet_seq = kf - 1;
+            }
+            *pkt_out = NULL;
+            return CUFRAMES_ERR_PACKET_OVERRUN;
+        }
+
+        if (r != CUFRAMES_ERR_TIMEOUT) {
+            *pkt_out = NULL;
+            return r;
+        }
+
+        /* No new packet yet. Clear the out-pointer on these exits as well,
+         * so a stale pointer from an earlier call cannot be mistaken for a
+         * fresh packet (same convention as the other failure exits). */
+        if (timeout_ms == 0) {
+            *pkt_out = NULL;
+            return CUFRAMES_ERR_WOULD_BLOCK;
+        }
+        if (timeout_ms > 0 && cuframes_now_ns() >= deadline_ns) {
+            *pkt_out = NULL;
+            return CUFRAMES_ERR_TIMEOUT;
+        }
+        struct timespec ts = {0, 1 * 1000 * 1000};   /* 1 ms poll interval */
+        nanosleep(&ts, NULL);
+    }
+}
+
+/* Hand a packet obtained from cuframes_subscriber_next_packet back to the
+ * subscriber so the next packet can be fetched.
+ *
+ * A NULL packet is accepted and ignored. A packet that is not this
+ * subscriber's own packet object is rejected with CUFRAMES_ERR_INVALID_ARG,
+ * as is a NULL subscriber. */
+int cuframes_subscriber_release_packet(cuframes_subscriber_t *sub,
+                                        cuframes_packet_t *pkt) {
+    if (sub == NULL) {
+        return CUFRAMES_ERR_INVALID_ARG;
+    }
+    if (pkt == NULL) {
+        return CUFRAMES_OK;
+    }
+    if (pkt == &sub->packet_obj) {
+        sub->packet_busy = 0;
+        return CUFRAMES_OK;
+    }
+    return CUFRAMES_ERR_INVALID_ARG;
+}
+
+/* Report the codec id and codec extradata advertised in the packet ring
+ * header.
+ *
+ * Any of the three out-pointers may be NULL. *extradata_out points directly
+ * into the mapped shared-memory header; it is not a copy.
+ *
+ * Returns:
+ *   CUFRAMES_OK                  — extradata is present;
+ *   CUFRAMES_ERR_NO_CODEC_PARAMS — extradata size is 0 (outputs are still
+ *                                  filled, size reported as 0);
+ *   CUFRAMES_ERR_PROTOCOL        — the header advertises more extradata than
+ *                                  CUFRAMES_PKT_EXTRADATA_MAX (outputs are
+ *                                  left untouched);
+ *   CUFRAMES_ERR_NO_PACKET_RING  — packets were never enabled;
+ *   CUFRAMES_ERR_INVALID_ARG     — NULL subscriber. */
+int cuframes_subscriber_get_codec_params(cuframes_subscriber_t *sub,
+                                          uint32_t *codec_id_out,
+                                          const void **extradata_out,
+                                          size_t *extradata_size_out) {
+    if (!sub) return CUFRAMES_ERR_INVALID_ARG;
+    if (!sub->has_pkt_ring) return CUFRAMES_ERR_NO_PACKET_RING;
+    cuframes_pkt_header_t *hdr = sub->pkt_ring.hdr;
+
+    /* The header lives in memory another process writes to. Take ONE
+     * snapshot of the size so the reported size and the return code cannot
+     * disagree. The acquire load is meant to pair with the release fence the
+     * publisher issues between writing the extradata bytes and the size. */
+    uint32_t extradata_size =
+        __atomic_load_n(&hdr->codec_extradata_size, __ATOMIC_ACQUIRE);
+
+    /* Never hand out a length that runs past the extradata array. */
+    if (extradata_size > CUFRAMES_PKT_EXTRADATA_MAX) {
+        CUFRAMES_LOG_ERROR("packet ring: codec_extradata_size=%u exceeds max %u",
+                           extradata_size, CUFRAMES_PKT_EXTRADATA_MAX);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+
+    if (codec_id_out)       *codec_id_out = hdr->codec_id;
+    if (extradata_out)      *extradata_out = hdr->codec_extradata;
+    if (extradata_size_out) *extradata_size_out = extradata_size;
+    if (extradata_size == 0) return CUFRAMES_ERR_NO_CODEC_PARAMS;
+    return CUFRAMES_OK;
+}
@@ -8,6 +8,7 @@
 #define CUFRAMES_INTERNAL_H

 #define _GNU_SOURCE
+#include <cuda.h>          /* v0.4 — driver API: cuMemCreate/cuMemMap/cuMemExportToShareableHandle */
 #include <cuda_runtime.h>
 #include <pthread.h>
 #include <stdatomic.h>
@@ -21,14 +22,33 @@

 /* ─── Protocol constants ──────────────────────────────────────────────── */

-#define CUFRAMES_MAGIC          0xCC7C1DCCu
+#define CUFRAMES_MAGIC          0xCC7C1DCEu         /* v0.4 — bumped с 0xCC7C1DCC (full ABI break) */
+#define CUFRAMES_MAGIC_LEGACY   0xCC7C1DCCu         /* v0.1—v0.3 magic; ловится consumer'ом как clean PROTOCOL error */
 #define CUFRAMES_PROTOCOL_V1    1u
+#define CUFRAMES_PROTOCOL_V2    2u                  /* v0.2 — packet ring support */
+#define CUFRAMES_PROTOCOL_V3    3u                  /* v0.3 — per-slot CUDA events (deprecated; не работает без pid share) */
+#define CUFRAMES_PROTOCOL_V4    4u                  /* v0.4 — VMM + POSIX FD: pid/ipc namespace share не требуется */
 #define CUFRAMES_MAX_SUBSCRIBERS 32
 #define CUFRAMES_MAX_RING       16
 #define CUFRAMES_MAX_KEY_LEN    63
 #define CUFRAMES_MAX_NAME_LEN   31
 #define CUFRAMES_RUNTIME_DIR    "/run/cuframes"
 #define CUFRAMES_SHM_PREFIX     "/cuframes-"
+#define CUFRAMES_PKT_SHM_SUFFIX "-packets"          /* /cuframes-<key>-packets */
+
+/* Packet ring constants (see docs/protocol.md §10) */
+#define CUFRAMES_PKT_MAGIC              0xCC7C1DCDu /* legacy frames magic (0xCC7C1DCC) + 1; NOT current CUFRAMES_MAGIC + 1 */
+#define CUFRAMES_PKT_EXTRADATA_MAX      4096u
+#define CUFRAMES_PKT_DEFAULT_SLOTS      64u
+#define CUFRAMES_PKT_DEFAULT_DATA_SIZE  (8u * 1024u * 1024u)   /* 8 MB */
+#define CUFRAMES_PKT_DEFAULT_MAX_SIZE   (2u * 1024u * 1024u)   /* 2 MB */
+#define CUFRAMES_PKT_MAX_SLOTS          1024u
+
+/* Packet flags (see docs/protocol.md §10.6) */
+#define CUFRAMES_PKT_FLAG_KEY           0x01u
+#define CUFRAMES_PKT_FLAG_CORRUPT       0x02u
+#define CUFRAMES_PKT_FLAG_DISCONTINUITY 0x04u
+#define CUFRAMES_PKT_FLAG_LAST_IN_AU    0x08u

 /* ─── Shared memory layout (см. docs/protocol.md §2) ──────────────────── */

@@ -91,6 +111,11 @@ typedef struct __attribute__((packed)) cuframes_shm_header {
    /* offset 0x100 — variable-length tail */
    cuframes_shm_slot_t slots[CUFRAMES_MAX_RING];                /* 192 × 16 = 3072 */
    cuframes_shm_subscriber_t subscribers[CUFRAMES_MAX_SUBSCRIBERS]; /* 128 × 32 = 4096 */
+    /* v0.3 — per-slot CUDA event handles. Producer records event per publish;
+     * consumer waits event[slot_idx] specifically (не global ipc_event_handle
+     * который signals только для последнего published frame). Закрывает TOCTOU
+     * race в slot read. 64 × 16 = 1024 bytes. */
+    cudaIpcEventHandle_t slot_event_handles[CUFRAMES_MAX_RING];
 } cuframes_shm_header_t;

 /* Layout sanity checks (docs/protocol.md §2 table) */
@@ -103,6 +128,73 @@ _Static_assert(offsetof(cuframes_shm_header_t, ipc_event_handle) == 0x0080, "eve
 _Static_assert(offsetof(cuframes_shm_header_t, global_seq) == 0x00C0, "global_seq offset");
 _Static_assert(offsetof(cuframes_shm_header_t, slots) == 0x0100, "slots offset");

+/* ─── Packet ring shared memory layout (docs/protocol.md §10) ──────────── */
+
+/* Packet slot entry — packed, 64 bytes. One entry describes one published
+ * packet; the payload itself lives in the data area of the segment. */
+typedef struct __attribute__((packed)) cuframes_pkt_slot {
+    _Atomic uint64_t seq;          /* UINT64_MAX = invalid */
+    int64_t          pts_ns;
+    int64_t          dts_ns;
+    uint64_t         data_offset;  /* absolute byte cursor; % data_size = ring offset */
+    uint32_t         data_size;    /* payload length in bytes */
+    uint32_t         flags;        /* CUFRAMES_PKT_FLAG_* bits */
+    uint8_t          reserved[24]; /* pads the entry to 64 bytes */
+} cuframes_pkt_slot_t;
+_Static_assert(sizeof(cuframes_pkt_slot_t) == 64, "packet slot must be 64 bytes");
+
+/* Packet ring header (fixed 0x1040 = 4160 bytes). Followed by slots[N] + data[].
+ * The layout is pinned by the _Static_assert checks that follow the struct. */
+typedef struct __attribute__((packed)) cuframes_pkt_header {
+    uint32_t magic;                     /* CUFRAMES_PKT_MAGIC */
+    uint32_t proto_version;             /* 2 */
+    uint32_t ring_slots;                /* number of slot entries after the header */
+    uint32_t data_size;                 /* size in bytes of the data area */
+    uint32_t codec_id;                  /* AV_CODEC_ID_H264 / HEVC / ... */
+    uint32_t codec_extradata_size;      /* ≤ CUFRAMES_PKT_EXTRADATA_MAX */
+    uint64_t producer_pid;              /* pid of the publishing process */
+    _Atomic uint64_t global_seq;        /* seq of the newest packet; UINT64_MAX until the first publish */
+    _Atomic uint64_t last_keyframe_seq; /* seq of the newest keyframe; UINT64_MAX if none yet */
+    _Atomic uint64_t write_offset;      /* absolute (unwrapped) byte cursor into the data area */
+    _Atomic uint64_t shutdown_flag;
+    uint8_t  codec_extradata[CUFRAMES_PKT_EXTRADATA_MAX];
+    /* offset 0x1040 — slots[ring_slots], then data[data_size] */
+} cuframes_pkt_header_t;
+
+_Static_assert(offsetof(cuframes_pkt_header_t, magic) == 0x0000, "pkt magic offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, proto_version) == 0x0004, "pkt proto offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, producer_pid) == 0x0018, "pkt pid offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, global_seq) == 0x0020, "pkt global_seq offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, write_offset) == 0x0030, "pkt write_offset offset");
+_Static_assert(offsetof(cuframes_pkt_header_t, codec_extradata) == 0x0040, "pkt extradata offset");
+_Static_assert(sizeof(cuframes_pkt_header_t) == 0x1040, "pkt header must be 0x1040 bytes");
+
+/* Total byte size of a packet SHM segment:
+ *   header, then `slots` slot entries, then `data_size` bytes of payload area.
+ */
+static inline size_t cuframes_pkt_shm_size(uint32_t slots, uint32_t data_size) {
+    const size_t slot_table_bytes = (size_t)slots * sizeof(cuframes_pkt_slot_t);
+    size_t total = sizeof(cuframes_pkt_header_t);
+    total += slot_table_bytes;
+    total += (size_t)data_size;
+    return total;
+}
+
+/* Address of the slot table inside a mapped packet SHM segment: it starts
+ * immediately after the fixed-size header. */
+static inline cuframes_pkt_slot_t * cuframes_pkt_slots(cuframes_pkt_header_t *hdr) {
+    uint8_t *segment_base = (uint8_t *)hdr;
+    uint8_t *slot_table = segment_base + sizeof(cuframes_pkt_header_t);
+    return (cuframes_pkt_slot_t *)slot_table;
+}
+/* Address of the payload data area inside a mapped packet SHM segment: it
+ * follows the header and the hdr->ring_slots slot entries. */
+static inline uint8_t * cuframes_pkt_data(cuframes_pkt_header_t *hdr) {
+    const size_t slot_table_bytes =
+        (size_t)hdr->ring_slots * sizeof(cuframes_pkt_slot_t);
+    uint8_t *after_header = (uint8_t *)hdr + sizeof(cuframes_pkt_header_t);
+    return after_header + slot_table_bytes;
+}
+
+/* Opaque ring handle — holds the state and the mapping for a publisher or a subscriber. */
+typedef struct cuframes_pkt_ring {
+    int    shm_fd;          /* descriptor from shm_open; -1 while not open */
+    void  *shm_base;        /* base address of the mmap'ed segment */
+    size_t shm_size;        /* length of the mapping in bytes */
+    cuframes_pkt_header_t *hdr;   /* header view; same address as shm_base */
+    char   shm_name[128];   /* /cuframes-<key>-packets */
+    int    is_publisher;    /* 1 = created by the publisher, 0 = opened by a subscriber */
+} cuframes_pkt_ring_t;
+
 /* ─── Socket protocol messages (docs/protocol.md §3) ───────────────────── */

 #define CUFRAMES_MSG_HELLO_REQ      0x01
@@ -115,6 +207,10 @@ _Static_assert(offsetof(cuframes_shm_header_t, slots) == 0x0100, "slots offset")
 #define CUFRAMES_MSG_PING           0xF0
 #define CUFRAMES_MSG_PONG           0xF1
 #define CUFRAMES_MSG_ERROR          0xFE
+/* v0.4: after SUBSCRIBE_RESP the publisher sends VMM_FDS with N POSIX FD handles
+ * in SCM_RIGHTS control data. Payload: uint64_t slot_size + uint32_t fd_count +
+ * uint32_t reserved (for alignment). The FDs arrive in a separate control block. */
+#define CUFRAMES_MSG_VMM_FDS        0x05

 #define CUFRAMES_MAX_MSG_PAYLOAD 4096

@@ -148,6 +244,14 @@ typedef struct __attribute__((packed)) cuframes_msg_subscribe_resp {
    uint8_t  reserved[12];
 } cuframes_msg_subscribe_resp_t;

+/* v0.4: payload of the VMM_FDS message. The FDs themselves travel in the
+ * SCM_RIGHTS control message (see cuframes_internal_send_msg_with_fds). */
+typedef struct __attribute__((packed)) cuframes_msg_vmm_fds {
+    uint64_t slot_size_bytes;   /* physical size of one slot after rounding up to the granularity */
+    uint32_t fd_count;          /* must match ring_size */
+    uint32_t reserved;
+} cuframes_msg_vmm_fds_t;
+
 /* ─── Logging (minimal — to stderr) ────────────────────────────────────── */

 #define CUFRAMES_LOG_ERROR(fmt, ...) \
@@ -164,6 +268,8 @@ typedef struct __attribute__((packed)) cuframes_msg_subscribe_resp {
 int cuframes_internal_socket_path(const char *key, char *out, size_t out_size);
 /* Build /cuframes-<key> (for shm_open) */
 int cuframes_internal_shm_name(const char *key, char *out, size_t out_size);
+/* Build /cuframes-<key>-packets (for shm_open) */
+int cuframes_internal_pkt_shm_name(const char *key, char *out, size_t out_size);
 /* Validate key per protocol.md (alphanum/_/-, 1..63 chars) */
 int cuframes_internal_validate_key(const char *key);
 /* Calculate frame size + pitch для format/W/H */
@@ -181,4 +287,57 @@ int cuframes_internal_recv_msg(int sock_fd, uint32_t *msg_type_out,
                                void *payload, uint32_t *payload_len_inout,
                                int32_t timeout_ms);

+/* v0.4 — send/recv with attached FDs. Used only for the VMM_FDS message. */
+int cuframes_internal_send_msg_with_fds(int sock_fd, uint32_t msg_type,
+                                         const void *payload, uint32_t payload_len,
+                                         const int *fds, uint32_t fd_count);
+int cuframes_internal_recv_msg_with_fds(int sock_fd, uint32_t *msg_type_out,
+                                         void *payload, uint32_t *payload_len_inout,
+                                         int *fds_out, uint32_t *fd_count_inout,
+                                         int32_t timeout_ms);
+
+/* ─── Packet ring helpers (libcuframes/src/packet_ring.c) ─────────────── */
+
+/* Publisher: create SHM + initialize header + slots. Stale recovery as for frames. */
+int cuframes_internal_pkt_ring_create(const char *key,
+                                       uint32_t slots,
+                                       uint32_t data_size,
+                                       uint32_t codec_id,
+                                       cuframes_pkt_ring_t *ring_out);
+
+/* Publisher: set codec extradata (SPS/PPS). Must be called before first publish.
+ * Если size > CUFRAMES_PKT_EXTRADATA_MAX → ERR_INVALID_ARG. */
+int cuframes_internal_pkt_ring_set_extradata(cuframes_pkt_ring_t *ring,
+                                              const void *extradata,
+                                              size_t size);
+
+/* Publisher: publish a single encoded packet. Slow consumer = overwrite oldest.
+ * Returns CUFRAMES_ERR_PACKET_OVERSIZED if size > data_size. */
+int cuframes_internal_pkt_ring_publish(cuframes_pkt_ring_t *ring,
+                                        const void *data, size_t size,
+                                        int64_t pts_ns, int64_t dts_ns,
+                                        uint32_t flags);
+
+/* Subscriber: open existing SHM by shm name (from HELLO_RESP packet_shm_path). */
+int cuframes_internal_pkt_ring_open(const char *shm_name,
+                                     cuframes_pkt_ring_t *ring_out);
+
+/* Subscriber: read next packet.
+ *   *seq_inout — currently held seq (we read seq_inout+1); updated on success.
+ *   out_buf must have ≥ max_packet_size bytes; out_size receives actual size.
+ * Returns:
+ *   CUFRAMES_OK on success
+ *   CUFRAMES_ERR_PACKET_OVERRUN if the publisher has run ahead — caller resyncs on a keyframe
+ *   CUFRAMES_ERR_TIMEOUT if there is no new packet
+ *   CUFRAMES_ERR_DISCONNECTED if the publisher has shut down */
+int cuframes_internal_pkt_ring_read(cuframes_pkt_ring_t *ring,
+                                     uint64_t *seq_inout,
+                                     void *out_buf, size_t out_buf_max,
+                                     size_t *out_size,
+                                     int64_t *out_pts, int64_t *out_dts,
+                                     uint32_t *out_flags);
+
+/* Publisher OR Subscriber: cleanup mmap + close FD. Publisher additionally shm_unlink. */
+void cuframes_internal_pkt_ring_destroy(cuframes_pkt_ring_t *ring);
+
 #endif /* CUFRAMES_INTERNAL_H */
@@ -0,0 +1,380 @@
+/* libcuframes/src/packet_ring.c
+ *
+ * Variable-length encoded packet ring buffer (docs/protocol.md §10).
+ *
+ * Uses POSIX shared memory (`/cuframes-<key>-packets`), packed
+ * structures with _Atomic fields, and a seqlock-style read to protect
+ * against an overrun in the middle of a read.
+ *
+ * This module is internal — the exposed API is planned for Step 3
+ * (cuframes.h extension). For now the functions carry the prefix
+ * `cuframes_internal_pkt_ring_*` and are used from producer.c / consumer.c.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "internal.h"
+
+/* ─── Internal helpers ────────────────────────────────────────────────── */
+
+/* Copy n bytes from src into the circular buffer dst (capacity buf_size),
+ * starting at position offset % buf_size and continuing from index 0 when
+ * the end of the buffer is reached. The caller guarantees buf_size != 0 and
+ * n <= buf_size. */
+static void wraparound_memcpy(uint8_t *dst, const uint8_t *src, size_t n,
+                              size_t buf_size, size_t offset) {
+    const size_t start = offset % buf_size;
+    const size_t room_to_end = buf_size - start;
+    const size_t head = (n > room_to_end) ? room_to_end : n;
+    const size_t tail = n - head;
+
+    memcpy(dst + start, src, head);
+    if (tail != 0) {
+        /* Remainder wraps to the beginning of the buffer. */
+        memcpy(dst, src + head, tail);
+    }
+}
+
+/* Copy n bytes out of the circular buffer buf (capacity buf_size) into the
+ * linear buffer out, starting at position offset % buf_size and continuing
+ * from index 0 when the end of the buffer is reached. The caller guarantees
+ * buf_size != 0 and n <= buf_size. */
+static void wraparound_memcpy_from(uint8_t *out, const uint8_t *buf,
+                                   size_t buf_size, size_t offset, size_t n) {
+    const size_t start = offset % buf_size;
+    const size_t room_to_end = buf_size - start;
+    const size_t head = (n > room_to_end) ? room_to_end : n;
+    const size_t tail = n - head;
+
+    memcpy(out, buf + start, head);
+    if (tail != 0) {
+        /* Remainder comes from the beginning of the buffer. */
+        memcpy(out + head, buf, tail);
+    }
+}
+
+/* ─── Publisher API ───────────────────────────────────────────────────── */
+
+/* Publisher side: create and initialise the packet ring SHM for `key`.
+ *
+ * key       — stream key; turned into the SHM name by
+ *             cuframes_internal_pkt_shm_name.
+ * slots     — number of slot entries, 1..CUFRAMES_PKT_MAX_SLOTS.
+ * data_size — size in bytes of the payload data area, must be non-zero.
+ * codec_id  — stored verbatim in the header.
+ * ring_out  — filled with the descriptor, mapping and header pointer on
+ *             success; it is zeroed first, so on failure it holds no mapping.
+ *
+ * Returns CUFRAMES_OK, CUFRAMES_ERR_INVALID_ARG for bad arguments,
+ * CUFRAMES_ERR_ALREADY_EXISTS when an existing segment belongs to a publisher
+ * pid that is still alive, CUFRAMES_ERR_IO for shm_open/ftruncate/mmap
+ * failures, or the error reported by the name helper. */
+int cuframes_internal_pkt_ring_create(const char *key,
+                                       uint32_t slots,
+                                       uint32_t data_size,
+                                       uint32_t codec_id,
+                                       cuframes_pkt_ring_t *ring_out) {
+    if (!ring_out) return CUFRAMES_ERR_INVALID_ARG;
+    if (slots == 0 || slots > CUFRAMES_PKT_MAX_SLOTS) return CUFRAMES_ERR_INVALID_ARG;
+    if (data_size == 0) return CUFRAMES_ERR_INVALID_ARG;
+
+    memset(ring_out, 0, sizeof(*ring_out));
+    ring_out->shm_fd = -1;
+    ring_out->is_publisher = 1;
+
+    int r = cuframes_internal_pkt_shm_name(key, ring_out->shm_name,
+                                            sizeof(ring_out->shm_name));
+    if (r != CUFRAMES_OK) return r;
+
+    /* Stale recovery (same approach as for the frames SHM) */
+    int fd = shm_open(ring_out->shm_name, O_CREAT | O_EXCL | O_RDWR, 0644);
+    if (fd < 0) {
+        if (errno == EEXIST) {
+            /* A segment with this name already exists: peek at its header to
+             * find out whether its publisher is still running. */
+            int existing = shm_open(ring_out->shm_name, O_RDWR, 0);
+            if (existing >= 0) {
+                cuframes_pkt_header_t tmp;
+                ssize_t rb = read(existing, &tmp, sizeof(tmp));
+                close(existing);
+                if (rb == (ssize_t)sizeof(tmp) && tmp.magic == CUFRAMES_PKT_MAGIC) {
+                    if (cuframes_internal_pid_alive((pid_t)tmp.producer_pid)) {
+                        CUFRAMES_LOG_ERROR("packet ring %s: publisher pid %lu still alive",
+                                           ring_out->shm_name,
+                                           (unsigned long)tmp.producer_pid);
+                        return CUFRAMES_ERR_ALREADY_EXISTS;
+                    }
+                }
+            }
+            /* NOTE(review): a segment that cannot be opened, is shorter than
+             * the header, or has a foreign magic is also treated as stale and
+             * unlinked here — confirm this is intended. */
+            CUFRAMES_LOG_INFO("stale packet shm %s — unlinking", ring_out->shm_name);
+            shm_unlink(ring_out->shm_name);
+            fd = shm_open(ring_out->shm_name, O_CREAT | O_EXCL | O_RDWR, 0644);
+            if (fd < 0) {
+                CUFRAMES_LOG_ERROR("packet shm_open after unlink: %s", strerror(errno));
+                return CUFRAMES_ERR_IO;
+            }
+        } else {
+            CUFRAMES_LOG_ERROR("packet shm_open: %s", strerror(errno));
+            return CUFRAMES_ERR_IO;
+        }
+    }
+
+    /* Size the segment: header + slot table + data area. */
+    size_t total_size = cuframes_pkt_shm_size(slots, data_size);
+    if (ftruncate(fd, (off_t)total_size) < 0) {
+        CUFRAMES_LOG_ERROR("packet ftruncate(%zu): %s", total_size, strerror(errno));
+        close(fd);
+        shm_unlink(ring_out->shm_name);
+        return CUFRAMES_ERR_IO;
+    }
+
+    void *base = mmap(NULL, total_size, PROT_READ | PROT_WRITE,
+                      MAP_SHARED, fd, 0);
+    if (base == MAP_FAILED) {
+        CUFRAMES_LOG_ERROR("packet mmap: %s", strerror(errno));
+        close(fd);
+        shm_unlink(ring_out->shm_name);
+        return CUFRAMES_ERR_IO;
+    }
+
+    ring_out->shm_fd = fd;
+    ring_out->shm_base = base;
+    ring_out->shm_size = total_size;
+    ring_out->hdr = (cuframes_pkt_header_t *)base;
+
+    /* Initialize header — zeros + magic/version/sizes */
+    memset(ring_out->hdr, 0, sizeof(*ring_out->hdr));
+    ring_out->hdr->magic = CUFRAMES_PKT_MAGIC;
+    ring_out->hdr->proto_version = CUFRAMES_PROTOCOL_V2;
+    ring_out->hdr->ring_slots = slots;
+    ring_out->hdr->data_size = data_size;
+    ring_out->hdr->codec_id = codec_id;
+    ring_out->hdr->codec_extradata_size = 0;
+    ring_out->hdr->producer_pid = (uint64_t)getpid();
+    /* UINT64_MAX in the sequence fields means "nothing published yet". */
+    atomic_store_explicit(&ring_out->hdr->global_seq, UINT64_MAX,
+                          memory_order_release);
+    atomic_store_explicit(&ring_out->hdr->last_keyframe_seq, UINT64_MAX,
+                          memory_order_release);
+    atomic_store_explicit(&ring_out->hdr->write_offset, 0,
+                          memory_order_release);
+    atomic_store_explicit(&ring_out->hdr->shutdown_flag, 0,
+                          memory_order_release);
+
+    /* Initialize slots — invalid seq markers */
+    cuframes_pkt_slot_t *slots_arr = cuframes_pkt_slots(ring_out->hdr);
+    for (uint32_t i = 0; i < slots; ++i) {
+        atomic_store_explicit(&slots_arr[i].seq, UINT64_MAX,
+                              memory_order_release);
+    }
+
+    /* Data section is already zeroed by ftruncate (POSIX guarantees) */
+
+    CUFRAMES_LOG_INFO("packet ring %s: slots=%u data_size=%u codec_id=%u (total=%zu bytes)",
+                      ring_out->shm_name, slots, data_size, codec_id, total_size);
+    return CUFRAMES_OK;
+}
+
+/* Publisher side: store codec extradata in the ring header.
+ *
+ * ring      — publisher ring (subscriber rings are rejected).
+ * extradata — bytes to copy; may be NULL only when size is 0.
+ * size      — number of bytes, at most CUFRAMES_PKT_EXTRADATA_MAX; 0 clears
+ *             the advertised size without touching the stored bytes.
+ *
+ * Returns CUFRAMES_OK or CUFRAMES_ERR_INVALID_ARG. */
+int cuframes_internal_pkt_ring_set_extradata(cuframes_pkt_ring_t *ring,
+                                              const void *extradata,
+                                              size_t size) {
+    if (!ring || !ring->hdr) return CUFRAMES_ERR_INVALID_ARG;
+    if (!ring->is_publisher) return CUFRAMES_ERR_INVALID_ARG;
+    if (size > CUFRAMES_PKT_EXTRADATA_MAX) return CUFRAMES_ERR_INVALID_ARG;
+    if (size > 0 && !extradata) return CUFRAMES_ERR_INVALID_ARG;
+
+    /* Write the bytes first, then the size (release-style — a subscriber sees size>0 only once the extradata is ready). */
+    if (size > 0) {
+        memcpy(ring->hdr->codec_extradata, extradata, size);
+        /* Memory barrier — extradata stores complete before the size update. */
+        __atomic_thread_fence(__ATOMIC_RELEASE);
+    }
+    ring->hdr->codec_extradata_size = (uint32_t)size;
+    return CUFRAMES_OK;
+}
+
+/* Publisher side: append one encoded packet to the ring.
+ *
+ * ring   — publisher ring (subscriber rings are rejected).
+ * data   — payload bytes, non-NULL.
+ * size   — payload length, 1..hdr->data_size.
+ * pts_ns / dts_ns / flags — stored verbatim in the slot entry; a packet with
+ *          CUFRAMES_PKT_FLAG_KEY also updates last_keyframe_seq.
+ *
+ * The slot used is new_seq % ring_slots, so the oldest entry is overwritten
+ * when the ring is full. There is no locking: a single publisher is assumed.
+ *
+ * Write protocol (seqlock-style, so a concurrent reader can detect that a
+ * slot or a payload region changed under it):
+ *   1. mark the slot invalid (seq = UINT64_MAX) and advance write_offset to
+ *      cover the region about to be written;
+ *   2. release fence;
+ *   3. copy the payload and fill in the slot metadata;
+ *   4. release-store the new seq into the slot, then global_seq.
+ * Without step 1 a reader could observe the slot's OLD seq both before and
+ * after its copy while the metadata/payload were being rewritten in between.
+ *
+ * Returns CUFRAMES_OK, CUFRAMES_ERR_INVALID_ARG, or
+ * CUFRAMES_ERR_PACKET_OVERSIZED when size exceeds the data area. */
+int cuframes_internal_pkt_ring_publish(cuframes_pkt_ring_t *ring,
+                                        const void *data, size_t size,
+                                        int64_t pts_ns, int64_t dts_ns,
+                                        uint32_t flags) {
+    if (!ring || !ring->hdr) return CUFRAMES_ERR_INVALID_ARG;
+    if (!ring->is_publisher) return CUFRAMES_ERR_INVALID_ARG;
+    if (size == 0 || !data) return CUFRAMES_ERR_INVALID_ARG;
+    if (size > ring->hdr->data_size) return CUFRAMES_ERR_PACKET_OVERSIZED;
+
+    cuframes_pkt_header_t *hdr = ring->hdr;
+
+    /* Allocate next seq + cursor offset. Single publisher — no CAS needed. */
+    uint64_t prev_seq = atomic_load_explicit(&hdr->global_seq,
+                                             memory_order_relaxed);
+    uint64_t new_seq = (prev_seq == UINT64_MAX) ? 0 : prev_seq + 1;
+
+    uint64_t write_off = atomic_load_explicit(&hdr->write_offset,
+                                              memory_order_relaxed);
+
+    /* Slot index = seq % ring_slots. */
+    uint32_t slot_idx = (uint32_t)(new_seq % hdr->ring_slots);
+    cuframes_pkt_slot_t *slot = &cuframes_pkt_slots(hdr)[slot_idx];
+
+    /* Step 1: invalidate the slot being reused and reserve the data region
+     * BEFORE touching either. UINT64_MAX is the ring's existing "invalid"
+     * marker (fresh slots are initialised to it). Advancing write_offset
+     * first means a reader that validates a payload against write_offset
+     * sees the overwrite as soon as it starts, not only after it finished. */
+    atomic_store_explicit(&slot->seq, UINT64_MAX, memory_order_relaxed);
+    atomic_store_explicit(&hdr->write_offset, write_off + size,
+                          memory_order_relaxed);
+    /* Step 2: keep the two stores above ahead of the data writes below. */
+    __atomic_thread_fence(__ATOMIC_RELEASE);
+
+    /* Step 3: payload into the data ring (wraparound aware) + slot metadata. */
+    wraparound_memcpy(cuframes_pkt_data(hdr), data, size,
+                      hdr->data_size, write_off);
+
+    slot->pts_ns = pts_ns;
+    slot->dts_ns = dts_ns;
+    slot->data_offset = write_off;
+    slot->data_size = (uint32_t)size;
+    slot->flags = flags;
+
+    /* Step 4: RELEASE order — payload bytes + slot metadata are complete
+     * before the slot seq becomes visible. */
+    atomic_store_explicit(&slot->seq, new_seq, memory_order_release);
+    atomic_store_explicit(&hdr->global_seq, new_seq,
+                          memory_order_release);
+
+    /* Keyframe — update last_keyframe_seq for late subscribers. */
+    if (flags & CUFRAMES_PKT_FLAG_KEY) {
+        atomic_store_explicit(&hdr->last_keyframe_seq, new_seq,
+                              memory_order_release);
+    }
+
+    return CUFRAMES_OK;
+}
+
+/* ─── Subscriber API ──────────────────────────────────────────────────── */
+
+/* Subscriber side: open an existing packet ring SHM read-only.
+ *
+ * shm_name — POSIX SHM name of the ring (/cuframes-<key>-packets).
+ * ring_out — zeroed first, then filled with descriptor, mapping and header
+ *            pointer on success.
+ *
+ * The header comes from another process, so it is validated before it is
+ * trusted: magic, protocol version, slot count (1..CUFRAMES_PKT_MAX_SLOTS),
+ * non-zero data size, and that the SHM object is really as large as the
+ * header claims. Mapping more than the object holds would fault on access,
+ * and a zero slot count would later be used as a modulo divisor.
+ *
+ * Returns CUFRAMES_OK, CUFRAMES_ERR_INVALID_ARG, CUFRAMES_ERR_NOT_FOUND when
+ * the SHM does not exist, CUFRAMES_ERR_PROTOCOL for a header that fails
+ * validation, or CUFRAMES_ERR_IO for shm_open/read/fstat/mmap failures. */
+int cuframes_internal_pkt_ring_open(const char *shm_name,
+                                     cuframes_pkt_ring_t *ring_out) {
+    if (!shm_name || !ring_out) return CUFRAMES_ERR_INVALID_ARG;
+
+    memset(ring_out, 0, sizeof(*ring_out));
+    ring_out->shm_fd = -1;
+    ring_out->is_publisher = 0;
+    /* memset above guarantees NUL termination even if the name is truncated. */
+    strncpy(ring_out->shm_name, shm_name, sizeof(ring_out->shm_name) - 1);
+
+    int fd = shm_open(shm_name, O_RDONLY, 0);
+    if (fd < 0) {
+        if (errno == ENOENT) return CUFRAMES_ERR_NOT_FOUND;
+        CUFRAMES_LOG_ERROR("packet shm_open(%s) ro: %s", shm_name, strerror(errno));
+        return CUFRAMES_ERR_IO;
+    }
+
+    /* Read the header to learn the total size */
+    cuframes_pkt_header_t header_peek;
+    ssize_t rb = read(fd, &header_peek, sizeof(header_peek));
+    if (rb != (ssize_t)sizeof(header_peek)) {
+        close(fd);
+        return CUFRAMES_ERR_IO;
+    }
+    if (header_peek.magic != CUFRAMES_PKT_MAGIC) {
+        CUFRAMES_LOG_ERROR("packet shm %s: bad magic 0x%08x", shm_name, header_peek.magic);
+        close(fd);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+    if (header_peek.proto_version != CUFRAMES_PROTOCOL_V2) {
+        CUFRAMES_LOG_ERROR("packet shm %s: proto_version=%u (expected %u)",
+                           shm_name, header_peek.proto_version, CUFRAMES_PROTOCOL_V2);
+        close(fd);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+
+    /* Geometry must be within the limits the publisher itself enforces. */
+    if (header_peek.ring_slots == 0 ||
+        header_peek.ring_slots > CUFRAMES_PKT_MAX_SLOTS ||
+        header_peek.data_size == 0) {
+        CUFRAMES_LOG_ERROR("packet shm %s: invalid geometry slots=%u data_size=%u",
+                           shm_name, header_peek.ring_slots, header_peek.data_size);
+        close(fd);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+
+    size_t total = cuframes_pkt_shm_size(header_peek.ring_slots,
+                                          header_peek.data_size);
+
+    /* The object must actually be as large as the header says it is. */
+    struct stat st;
+    if (fstat(fd, &st) < 0) {
+        CUFRAMES_LOG_ERROR("packet fstat(%s): %s", shm_name, strerror(errno));
+        close(fd);
+        return CUFRAMES_ERR_IO;
+    }
+    if (st.st_size < 0 || (uint64_t)st.st_size < (uint64_t)total) {
+        CUFRAMES_LOG_ERROR("packet shm %s: object size %lld < expected %zu",
+                           shm_name, (long long)st.st_size, total);
+        close(fd);
+        return CUFRAMES_ERR_PROTOCOL;
+    }
+
+    /* Map the whole segment read-only */
+    void *base = mmap(NULL, total, PROT_READ, MAP_SHARED, fd, 0);
+    if (base == MAP_FAILED) {
+        CUFRAMES_LOG_ERROR("packet mmap ro: %s", strerror(errno));
+        close(fd);
+        return CUFRAMES_ERR_IO;
+    }
+
+    ring_out->shm_fd = fd;
+    ring_out->shm_base = base;
+    ring_out->shm_size = total;
+    ring_out->hdr = (cuframes_pkt_header_t *)base;
+
+    CUFRAMES_LOG_INFO("packet ring %s opened: slots=%u data_size=%u",
+                      shm_name, header_peek.ring_slots, header_peek.data_size);
+    return CUFRAMES_OK;
+}
+
+/*
+ * Non-blocking read of the packet that follows *seq_inout.
+ *
+ * ring         Ring handle with a mapped header.
+ * seq_inout    In: sequence number of the last packet consumed, or UINT64_MAX
+ *              if nothing has been consumed yet (reading then starts at 0).
+ *              Out: set to the sequence that was read; written only on success.
+ * out_buf      Destination for the payload.
+ * out_buf_max  Capacity of out_buf in bytes.
+ * out_size     Receives the payload size.
+ * out_pts,
+ * out_dts,
+ * out_flags    Receive the slot's pts_ns / dts_ns / flags.
+ *
+ * Returns CUFRAMES_OK                 a packet was copied out,
+ *         CUFRAMES_ERR_TIMEOUT        nothing newer than *seq_inout is published,
+ *         CUFRAMES_ERR_DISCONNECTED   the header's shutdown_flag is set,
+ *         CUFRAMES_ERR_PACKET_OVERRUN the wanted slot was (or is being) reused,
+ *         CUFRAMES_ERR_PROTOCOL       the header reports zero ring slots,
+ *         CUFRAMES_ERR_INVALID_ARG    NULL argument or out_buf too small.
+ *
+ * NOTE(review): the seq re-checks below detect a concurrent rewrite only if
+ * the publisher changes slot->seq before it touches the slot's other fields;
+ * confirm against the publish path.
+ */
+int cuframes_internal_pkt_ring_read(cuframes_pkt_ring_t *ring,
+                                     uint64_t *seq_inout,
+                                     void *out_buf, size_t out_buf_max,
+                                     size_t *out_size,
+                                     int64_t *out_pts, int64_t *out_dts,
+                                     uint32_t *out_flags) {
+    if (!ring || !ring->hdr || !seq_inout || !out_buf || !out_size
+        || !out_pts || !out_dts || !out_flags) {
+        return CUFRAMES_ERR_INVALID_ARG;
+    }
+
+    cuframes_pkt_header_t *hdr = ring->hdr;
+
+    /* Publisher signalled shutdown? */
+    if (atomic_load_explicit(&hdr->shutdown_flag, memory_order_acquire) != 0) {
+        return CUFRAMES_ERR_DISCONNECTED;
+    }
+
+    /* ring_slots is used as a modulus below; refuse a zero-slot ring instead
+     * of dividing by zero. */
+    const uint64_t slots = hdr->ring_slots;
+    if (slots == 0) return CUFRAMES_ERR_PROTOCOL;
+
+    /* Latest published sequence; UINT64_MAX means nothing published yet. */
+    uint64_t cur = atomic_load_explicit(&hdr->global_seq, memory_order_acquire);
+    if (cur == UINT64_MAX) return CUFRAMES_ERR_TIMEOUT;
+    if (*seq_inout != UINT64_MAX && cur <= *seq_inout) {
+        return CUFRAMES_ERR_TIMEOUT;
+    }
+
+    /* Next sequence we want (the very first read starts at 0). */
+    uint64_t want_seq = (*seq_inout == UINT64_MAX) ? 0 : (*seq_inout + 1);
+
+    /* The publisher is a full ring ahead: the slot has been reused already.
+     * Tell the caller to resync. */
+    if (cur - want_seq >= slots) {
+        return CUFRAMES_ERR_PACKET_OVERRUN;
+    }
+
+    uint32_t slot_idx = (uint32_t)(want_seq % slots);
+    cuframes_pkt_slot_t *slot = &cuframes_pkt_slots(hdr)[slot_idx];
+
+    /* Seqlock-style read: check seq, copy, then prove seq did not change. */
+    uint64_t s1 = atomic_load_explicit(&slot->seq, memory_order_acquire);
+    if (s1 != want_seq) {
+        /* Slot already belongs to another packet. */
+        return CUFRAMES_ERR_PACKET_OVERRUN;
+    }
+
+    /* Snapshot the metadata with plain loads; validated right below. */
+    uint64_t data_off = slot->data_offset;
+    uint32_t data_sz  = slot->data_size;
+    int64_t  pts      = slot->pts_ns;
+    int64_t  dts      = slot->dts_ns;
+    uint32_t flags    = slot->flags;
+
+    /* Validate the snapshot BEFORE acting on it, so a torn size/offset is
+     * reported as an overrun rather than as "buffer too small" or used for
+     * the copy. The acquire fence keeps the plain loads above from being
+     * reordered after the seq re-load (an acquire load alone does not). */
+    atomic_thread_fence(memory_order_acquire);
+    if (atomic_load_explicit(&slot->seq, memory_order_relaxed) != want_seq) {
+        return CUFRAMES_ERR_PACKET_OVERRUN;
+    }
+
+    if (data_sz > out_buf_max) {
+        return CUFRAMES_ERR_INVALID_ARG; /* caller's buffer is too small */
+    }
+
+    /* Copy the payload out of the data ring. */
+    wraparound_memcpy_from((uint8_t *)out_buf,
+                            cuframes_pkt_data(hdr),
+                            hdr->data_size, data_off, data_sz);
+
+    /* Post-check: slot->seq must be unchanged after the copy. Same fence
+     * reasoning as above, this time ordering the payload loads. */
+    atomic_thread_fence(memory_order_acquire);
+    uint64_t s2 = atomic_load_explicit(&slot->seq, memory_order_relaxed);
+    if (s2 != want_seq) {
+        return CUFRAMES_ERR_PACKET_OVERRUN;
+    }
+
+    *out_size  = data_sz;
+    *out_pts   = pts;
+    *out_dts   = dts;
+    *out_flags = flags;
+    *seq_inout = want_seq;
+    return CUFRAMES_OK;
+}
+
+/* ─── Cleanup ─────────────────────────────────────────────────────────── */
+
+/*
+ * Tear down a packet ring handle and reset it to the "closed" state
+ * (all fields zero, shm_fd == -1). A NULL handle is ignored.
+ *
+ * On the publisher side the header's shutdown_flag is raised before the
+ * mapping goes away, and the shared-memory name is unlinked afterwards.
+ */
+void cuframes_internal_pkt_ring_destroy(cuframes_pkt_ring_t *ring) {
+    if (ring == NULL) {
+        return;
+    }
+
+    /* Captured once: the handle is wiped at the end of this function. */
+    const int owner = ring->is_publisher;
+
+    /* Tell consumers we are shutting down while the header is still mapped. */
+    if (owner && ring->hdr != NULL) {
+        atomic_store_explicit(&ring->hdr->shutdown_flag, 1,
+                              memory_order_release);
+    }
+
+    /* Release the mapping, then the descriptor. */
+    if (ring->shm_base != NULL && ring->shm_size > 0) {
+        munmap(ring->shm_base, ring->shm_size);
+    }
+    if (ring->shm_fd >= 0) {
+        close(ring->shm_fd);
+    }
+
+    /* Only the publisher removes the name. */
+    if (owner && ring->shm_name[0] != '\0') {
+        shm_unlink(ring->shm_name);
+    }
+
+    memset(ring, 0, sizeof(*ring));
+    ring->shm_fd = -1;
+}
@@ -1,4 +1,14 @@
-/* Publisher implementation (docs/protocol.md §1, §2, §3.2, §4.2, §5). */
+/* Publisher implementation (docs/protocol.md §1, §2, §3.2, §4.2, §5).
+ *
+ * v0.4 — VMM + POSIX FD. Replaces cudaMalloc + cudaIpcGetMemHandle with
+ * cuMemCreate + cuMemExportToShareableHandle(POSIX_FILE_DESCRIPTOR). The FDs
+ * are passed to the consumer via SCM_RIGHTS, so a shared pid/ipc namespace is
+ * not required.
+ *
+ * Sync (instead of cudaEventRecord + cudaIpcEventHandle): do_publish calls
+ * cudaStreamSynchronize — the producer waits (~ms) for the stream to drain and
+ * only then publishes seq. The consumer reads the data with a DtoD copy and no
+ * event wait — HW coherence is guaranteed on a single GPU.
+ */

 #include "internal.h"
 #include <errno.h>
@@ -20,10 +30,18 @@ struct cuframes_publisher {
    char                socket_path[128];
    char                shm_name[80];

-    /* CUDA */
-    cudaEvent_t         event;
-    cudaIpcMemHandle_t  ipc_mem[CUFRAMES_MAX_RING];
-    void               *cuda_ptrs[CUFRAMES_MAX_RING];  /* mapped pointers */
+    /* v0.4 — VMM-allocated pool. Each slot: cuMemCreate → cuMemAddressReserve
+     * → cuMemMap → cuMemSetAccess. The FD is exported once and passed to all
+     * subscribers via SCM_RIGHTS. */
+    CUmemGenericAllocationHandle vmm_handles[CUFRAMES_MAX_RING];
+    CUdeviceptr                  vmm_ptrs[CUFRAMES_MAX_RING];
+    int                          vmm_fds[CUFRAMES_MAX_RING];
+    size_t                       vmm_slot_size;   /* rounded к granularity */
+    int                          has_vmm_pool;
+
+    /* CUDA stream sync — replaces per-slot events. Before every publish the
+     * producer calls cudaStreamSynchronize to guarantee that previous writes
+     * have completed (data visible to consumers on any GPU stream). */
    size_t              frame_size_bytes;
    int32_t             ring_size_actual;

@@ -32,22 +50,31 @@ struct cuframes_publisher {
    int32_t             current_slot;  /* индекс slot'а полученного через acquire() */
    int                 has_acquired;

-    /* EXTERNAL ownership: map user pointer → ring index */
-    void               *external_ptrs[CUFRAMES_MAX_RING];
-    int32_t             external_count;
-
    /* Subscriber-management thread */
    pthread_t           accept_thread;
    int                 accept_thread_alive;
    int                 stop_flag;
    pthread_mutex_t     state_mu;  /* protects subscriber connections */
+
+    /* v0.2 — encoded packet ring (optional). has_pkt_ring=1 → enabled. */
+    int                 has_pkt_ring;
+    uint32_t            max_packet_size;
+    cuframes_pkt_ring_t pkt_ring;
 };

 /* Forward decls */
 static void *accept_thread_main(void *arg);
 static int handshake_subscriber(struct cuframes_publisher *pub, int client_fd);
+static void free_vmm_pool(struct cuframes_publisher *pub);

-/* ─── Internal: alloc/setup CUDA pool and SHM ─────────────────────────── */
+/* Helper: turn a CUresult into printable text for CUFRAMES_LOG_ERROR.
+ * Returns "?" when the driver supplies no string for the code. */
+static const char *cu_err_str(CUresult r) {
+    const char *text = NULL;
+    cuGetErrorString(r, &text);
+    if (text == NULL) {
+        return "?";
+    }
+    return text;
+}
+
+/* ─── Internal: alloc VMM pool + export POSIX FDs ─────────────────────── */

 static int alloc_library_pool(struct cuframes_publisher *pub) {
    int r = cuframes_internal_calc_size(pub->cfg.format,
@@ -56,7 +83,37 @@ static int alloc_library_pool(struct cuframes_publisher *pub) {
    if (r != CUFRAMES_OK) return r;

    pub->ring_size_actual = pub->cfg.ring_size;
+    for (int i = 0; i < CUFRAMES_MAX_RING; i++) pub->vmm_fds[i] = -1;

+    /* Initialize CUDA driver API context */
+    CUresult cr = cuInit(0);
+    if (cr != CUDA_SUCCESS) {
+        CUFRAMES_LOG_ERROR("cuInit: %s", cu_err_str(cr));
+        return CUFRAMES_ERR_CUDA;
+    }
+
+    /* Pick allocation prop: pinned device memory с POSIX FD handle */
+    CUmemAllocationProp prop = {0};
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = pub->cfg.cuda_device;
+    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+
+    /* Round slot size up to granularity */
+    size_t granularity = 0;
+    cr = cuMemGetAllocationGranularity(&granularity, &prop,
+                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+    if (cr != CUDA_SUCCESS) {
+        CUFRAMES_LOG_ERROR("cuMemGetAllocationGranularity: %s", cu_err_str(cr));
+        return CUFRAMES_ERR_CUDA;
+    }
+    pub->vmm_slot_size = ((pub->frame_size_bytes + granularity - 1) / granularity)
+                         * granularity;
+    CUFRAMES_LOG_INFO("VMM granularity=%zu frame=%zu slot=%zu",
+                       granularity, pub->frame_size_bytes, pub->vmm_slot_size);
+
+    /* Required: also need a runtime API context so that cudaMemcpyAsync from
+     * user works on this allocation. cudaSetDevice достаточно. */
    cudaError_t cerr = cudaSetDevice(pub->cfg.cuda_device);
    if (cerr != cudaSuccess) {
        CUFRAMES_LOG_ERROR("cudaSetDevice(%d): %s",
@@ -64,59 +121,68 @@ static int alloc_library_pool(struct cuframes_publisher *pub) {
        return CUFRAMES_ERR_CUDA;
    }

+    CUmemAccessDesc access = {0};
+    access.location = prop.location;
+    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
    for (int i = 0; i < pub->ring_size_actual; ++i) {
-        cerr = cudaMalloc(&pub->cuda_ptrs[i], pub->frame_size_bytes);
-        if (cerr != cudaSuccess) {
-            CUFRAMES_LOG_ERROR("cudaMalloc slot %d: %s",
-                                i, cudaGetErrorString(cerr));
+        cr = cuMemCreate(&pub->vmm_handles[i], pub->vmm_slot_size, &prop, 0);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemCreate slot %d: %s", i, cu_err_str(cr));
+            free_vmm_pool(pub);
            return CUFRAMES_ERR_CUDA;
        }
-        cerr = cudaIpcGetMemHandle(&pub->ipc_mem[i], pub->cuda_ptrs[i]);
-        if (cerr != cudaSuccess) {
-            CUFRAMES_LOG_ERROR("cudaIpcGetMemHandle slot %d: %s",
-                                i, cudaGetErrorString(cerr));
+        cr = cuMemAddressReserve(&pub->vmm_ptrs[i], pub->vmm_slot_size, 0, 0, 0);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemAddressReserve slot %d: %s", i, cu_err_str(cr));
+            free_vmm_pool(pub);
+            return CUFRAMES_ERR_CUDA;
+        }
+        cr = cuMemMap(pub->vmm_ptrs[i], pub->vmm_slot_size, 0,
+                       pub->vmm_handles[i], 0);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemMap slot %d: %s", i, cu_err_str(cr));
+            free_vmm_pool(pub);
+            return CUFRAMES_ERR_CUDA;
+        }
+        cr = cuMemSetAccess(pub->vmm_ptrs[i], pub->vmm_slot_size, &access, 1);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemSetAccess slot %d: %s", i, cu_err_str(cr));
+            free_vmm_pool(pub);
+            return CUFRAMES_ERR_CUDA;
+        }
+        /* Export POSIX FD — будет shared с consumers через SCM_RIGHTS */
+        cr = cuMemExportToShareableHandle((void *)&pub->vmm_fds[i],
+                                           pub->vmm_handles[i],
+                                           CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
+        if (cr != CUDA_SUCCESS) {
+            CUFRAMES_LOG_ERROR("cuMemExportToShareableHandle slot %d: %s",
+                                i, cu_err_str(cr));
+            free_vmm_pool(pub);
            return CUFRAMES_ERR_CUDA;
        }
    }
+    pub->has_vmm_pool = 1;
    return CUFRAMES_OK;
 }

-static int register_external_pool(struct cuframes_publisher *pub,
-                                   void *const *ptrs, int32_t count,
-                                   size_t frame_size) {
-    if (count < 1 || count > CUFRAMES_MAX_RING) return CUFRAMES_ERR_INVALID_ARG;
-    pub->frame_size_bytes = frame_size;
-    pub->ring_size_actual = count;
-    pub->external_count = count;
-
-    cudaError_t cerr = cudaSetDevice(pub->cfg.cuda_device);
-    if (cerr != cudaSuccess) {
-        CUFRAMES_LOG_ERROR("cudaSetDevice: %s", cudaGetErrorString(cerr));
-        return CUFRAMES_ERR_CUDA;
-    }
-    for (int i = 0; i < count; ++i) {
-        if (!ptrs[i]) return CUFRAMES_ERR_INVALID_ARG;
-        pub->cuda_ptrs[i] = ptrs[i];
-        pub->external_ptrs[i] = ptrs[i];
-        cerr = cudaIpcGetMemHandle(&pub->ipc_mem[i], ptrs[i]);
-        if (cerr != cudaSuccess) {
-            CUFRAMES_LOG_ERROR("cudaIpcGetMemHandle on external ptr %p: %s",
-                                ptrs[i], cudaGetErrorString(cerr));
-            return CUFRAMES_ERR_CUDA;
+static void free_vmm_pool(struct cuframes_publisher *pub) {
+    for (int i = 0; i < CUFRAMES_MAX_RING; i++) {
+        if (pub->vmm_fds[i] >= 0) {
+            close(pub->vmm_fds[i]);
+            pub->vmm_fds[i] = -1;
+        }
+        if (pub->vmm_ptrs[i]) {
+            cuMemUnmap(pub->vmm_ptrs[i], pub->vmm_slot_size);
+            cuMemAddressFree(pub->vmm_ptrs[i], pub->vmm_slot_size);
+            pub->vmm_ptrs[i] = 0;
+        }
+        if (pub->vmm_handles[i]) {
+            cuMemRelease(pub->vmm_handles[i]);
+            pub->vmm_handles[i] = 0;
        }
    }
-    return CUFRAMES_OK;
-}
-
-static int create_event_handle(struct cuframes_publisher *pub) {
-    cudaError_t cerr = cudaEventCreateWithFlags(&pub->event,
-        cudaEventDisableTiming | cudaEventInterprocess);
-    if (cerr != cudaSuccess) {
-        CUFRAMES_LOG_ERROR("cudaEventCreateWithFlags: %s",
-                            cudaGetErrorString(cerr));
-        return CUFRAMES_ERR_CUDA;
-    }
-    return CUFRAMES_OK;
+    pub->has_vmm_pool = 0;
 }

 static int setup_shm(struct cuframes_publisher *pub) {
@@ -134,7 +200,8 @@ static int setup_shm(struct cuframes_publisher *pub) {
                cuframes_shm_header_t tmp;
                ssize_t rb = read(existing, &tmp, sizeof(tmp));
                close(existing);
-                if (rb == (ssize_t)sizeof(tmp) && tmp.magic == CUFRAMES_MAGIC) {
+                if (rb == (ssize_t)sizeof(tmp) &&
+                    (tmp.magic == CUFRAMES_MAGIC || tmp.magic == CUFRAMES_MAGIC_LEGACY)) {
                    if (cuframes_internal_pid_alive((pid_t)tmp.producer_pid)) {
                        CUFRAMES_LOG_ERROR("publisher with key=%s already running (pid %lu)",
                            pub->key, (unsigned long)tmp.producer_pid);
@@ -167,7 +234,7 @@ static int setup_shm(struct cuframes_publisher *pub) {
    memset(pub->hdr, 0, sizeof(cuframes_shm_header_t));

    pub->hdr->magic = CUFRAMES_MAGIC;
-    pub->hdr->proto_version = CUFRAMES_PROTOCOL_V1;
+    pub->hdr->proto_version = CUFRAMES_PROTOCOL_V4;
    pub->hdr->lib_version_major = CUFRAMES_VERSION_MAJOR;
    pub->hdr->lib_version_minor = CUFRAMES_VERSION_MINOR;
    pub->hdr->lib_version_patch = CUFRAMES_VERSION_PATCH;
@@ -187,16 +254,11 @@ static int setup_shm(struct cuframes_publisher *pub) {
    pub->hdr->meta.pitch_uv = puv;
    pub->hdr->meta.frame_size_bytes = pub->frame_size_bytes;

-    /* Export event handle */
-    cudaError_t cerr = cudaIpcGetEventHandle(&pub->hdr->ipc_event_handle, pub->event);
-    if (cerr != cudaSuccess) {
-        CUFRAMES_LOG_ERROR("cudaIpcGetEventHandle: %s", cudaGetErrorString(cerr));
-        return CUFRAMES_ERR_CUDA;
-    }
-
-    /* Fill slot descriptors */
+    /* v0.4: legacy event fields в header не используются (cuStreamSynchronize
+     * заменяет IPC events). Memzero выше — достаточно. */
+    /* Slot descriptors — mem_handle поле deprecated (передаётся через FDs),
+     * только seq atomic нужен. */
    for (int i = 0; i < pub->ring_size_actual; ++i) {
-        pub->hdr->slots[i].mem_handle = pub->ipc_mem[i];
        atomic_store_explicit(&pub->hdr->slots[i].seq, UINT64_MAX,
                              memory_order_release);
    }
@@ -280,6 +342,7 @@ static int common_init(struct cuframes_publisher *pub,
    pub->next_seq = 0;
    pub->current_slot = -1;
    pub->has_acquired = 0;
+    for (int i = 0; i < CUFRAMES_MAX_RING; i++) pub->vmm_fds[i] = -1;
    pthread_mutex_init(&pub->state_mu, NULL);
    return CUFRAMES_OK;
 }
@@ -295,7 +358,6 @@ int cuframes_publisher_create(const cuframes_publisher_config_t *cfg,
    common_init(pub, cfg);

    if ((r = alloc_library_pool(pub)) != CUFRAMES_OK) goto fail;
-    if ((r = create_event_handle(pub)) != CUFRAMES_OK) goto fail;
    if ((r = setup_shm(pub)) != CUFRAMES_OK) goto fail;
    if ((r = setup_socket(pub)) != CUFRAMES_OK) goto fail;

@@ -307,7 +369,7 @@ int cuframes_publisher_create(const cuframes_publisher_config_t *cfg,
    }
    pub->accept_thread_alive = 1;

-    CUFRAMES_LOG_INFO("publisher '%s' ready (ring=%d, %dx%d, fmt=%d, lib-owned)",
+    CUFRAMES_LOG_INFO("publisher '%s' ready (ring=%d, %dx%d, fmt=%d, lib-owned, v0.4 VMM)",
                       pub->key, pub->ring_size_actual,
                       pub->cfg.width, pub->cfg.height, (int)pub->cfg.format);
    *out = pub;
@@ -323,37 +385,12 @@ int cuframes_publisher_create_external(const cuframes_publisher_config_t *cfg,
                                        int32_t ptr_count,
                                        size_t frame_size,
                                        cuframes_publisher_t **out) {
-    int r = validate_config(cfg);
-    if (r != CUFRAMES_OK) return r;
-    if (cfg->ownership != CUFRAMES_OWNERSHIP_EXTERNAL) return CUFRAMES_ERR_INVALID_ARG;
-    if (!cuda_ptrs || ptr_count < 1) return CUFRAMES_ERR_INVALID_ARG;
-    if (frame_size == 0) return CUFRAMES_ERR_INVALID_ARG;
-
-    struct cuframes_publisher *pub = calloc(1, sizeof(*pub));
-    if (!pub) return CUFRAMES_ERR_OUT_OF_MEMORY;
-    common_init(pub, cfg);
-
-    if ((r = register_external_pool(pub, cuda_ptrs, ptr_count, frame_size)) != CUFRAMES_OK)
-        goto fail;
-    if ((r = create_event_handle(pub)) != CUFRAMES_OK) goto fail;
-    if ((r = setup_shm(pub)) != CUFRAMES_OK) goto fail;
-    if ((r = setup_socket(pub)) != CUFRAMES_OK) goto fail;
-
-    pub->stop_flag = 0;
-    if (pthread_create(&pub->accept_thread, NULL, accept_thread_main, pub) != 0) {
-        r = CUFRAMES_ERR_INTERNAL;
-        goto fail;
-    }
-    pub->accept_thread_alive = 1;
-
-    CUFRAMES_LOG_INFO("publisher '%s' ready (external pool=%d, %dx%d, fmt=%d)",
-                       pub->key, ptr_count,
-                       pub->cfg.width, pub->cfg.height, (int)pub->cfg.format);
-    *out = pub;
-    return CUFRAMES_OK;
-fail:
-    cuframes_publisher_destroy(pub);
-    return r;
+    /* v0.4: external ownership больше не поддерживается. VMM API требует
+     * cuMemCreate-allocated memory; existing cudaMalloc-pointers нельзя
+     * export'нуть как POSIX FD. Use LIBRARY ownership. */
+    (void)cfg; (void)cuda_ptrs; (void)ptr_count; (void)frame_size; (void)out;
+    CUFRAMES_LOG_ERROR("EXTERNAL ownership не поддерживается в v0.4 (VMM-only)");
+    return CUFRAMES_ERR_INVALID_ARG;
 }

 int cuframes_publisher_acquire(cuframes_publisher_t *pub, void **cuda_ptr_out) {
@@ -374,27 +411,24 @@ int cuframes_publisher_acquire(cuframes_publisher_t *pub, void **cuda_ptr_out) {
            while (1) {
                uint64_t ack = atomic_load_explicit(&pub->hdr->slots[slot].ack_bitmap,
                                                     memory_order_acquire);
-                /* Если slot ещё не публикован (seq == UINT64_MAX) — пропустить ack check */
                uint64_t cur_seq = atomic_load_explicit(&pub->hdr->slots[slot].seq,
                                                         memory_order_acquire);
                if (cur_seq == UINT64_MAX || (ack & bitmap) == bitmap) break;
                if (deadline && cuframes_now_ns() > deadline) {
-                    /* Mark slow subscriber dead и continue */
                    uint64_t missing = bitmap & ~ack;
                    CUFRAMES_LOG_WARN("strict-wait timeout, slow subscribers bitmap=0x%lx",
                                       (unsigned long)missing);
-                    /* clear missing subscribers — TODO: send unsubscribe in v0.2 */
                    atomic_fetch_and_explicit(&pub->hdr->subscriber_bitmap,
                                               ~missing, memory_order_release);
                    break;
                }
-                struct timespec ts = {.tv_sec = 0, .tv_nsec = 100000}; /* 100µs */
+                struct timespec ts = {.tv_sec = 0, .tv_nsec = 100000};
                nanosleep(&ts, NULL);
            }
        }
    }

-    *cuda_ptr_out = pub->cuda_ptrs[slot];
+    *cuda_ptr_out = (void *)(uintptr_t)pub->vmm_ptrs[slot];
    pub->current_slot = slot;
    pub->has_acquired = 1;
    return CUFRAMES_OK;
@@ -402,10 +436,14 @@ int cuframes_publisher_acquire(cuframes_publisher_t *pub, void **cuda_ptr_out) {

 static int do_publish(cuframes_publisher_t *pub, int32_t slot,
                       void *stream, int64_t pts_ns) {
-    /* Record event on producer's stream */
-    cudaError_t cerr = cudaEventRecord(pub->event, (cudaStream_t)stream);
+    /* v0.4 — заменяет cudaEventRecord+IPC events на cuStreamSynchronize.
+     * Producer ждёт что stream flush'нулся (~1ms на 5090), потом publishes
+     * seq atomically. Consumer читает данные через DtoD memcpy без event
+     * wait — hardware coherence гарантирована на одном GPU. */
+    cudaError_t cerr = cudaStreamSynchronize((cudaStream_t)stream);
    if (cerr != cudaSuccess) {
-        CUFRAMES_LOG_ERROR("cudaEventRecord: %s", cudaGetErrorString(cerr));
+        CUFRAMES_LOG_ERROR("cudaStreamSynchronize (slot %d): %s",
+                            slot, cudaGetErrorString(cerr));
        return CUFRAMES_ERR_CUDA;
    }

@@ -438,44 +476,8 @@ int cuframes_publisher_publish(cuframes_publisher_t *pub, void *stream, int64_t

 int cuframes_publisher_publish_external(cuframes_publisher_t *pub,
                                         void *cuda_ptr, void *stream, int64_t pts_ns) {
-    if (!pub || !cuda_ptr) return CUFRAMES_ERR_INVALID_ARG;
-    if (pub->cfg.ownership != CUFRAMES_OWNERSHIP_EXTERNAL) return CUFRAMES_ERR_INVALID_ARG;
-
-    int32_t slot = -1;
-    for (int i = 0; i < pub->external_count; ++i) {
-        if (pub->external_ptrs[i] == cuda_ptr) { slot = i; break; }
-    }
-    if (slot < 0) {
-        CUFRAMES_LOG_ERROR("external pointer %p not registered", cuda_ptr);
-        return CUFRAMES_ERR_INVALID_ARG;
-    }
-
-    /* STRICT_WAIT — то же что в acquire, но per-publish */
-    if (pub->cfg.policy == CUFRAMES_POLICY_STRICT_WAIT) {
-        uint64_t bitmap = atomic_load_explicit(&pub->hdr->subscriber_bitmap,
-                                                memory_order_acquire);
-        if (bitmap != 0) {
-            int64_t deadline = pub->cfg.consumer_ack_timeout_ms > 0
-                ? cuframes_now_ns() + (int64_t)pub->cfg.consumer_ack_timeout_ms * 1000000LL
-                : 0;
-            while (1) {
-                uint64_t ack = atomic_load_explicit(&pub->hdr->slots[slot].ack_bitmap,
-                                                     memory_order_acquire);
-                uint64_t cur_seq = atomic_load_explicit(&pub->hdr->slots[slot].seq,
-                                                         memory_order_acquire);
-                if (cur_seq == UINT64_MAX || (ack & bitmap) == bitmap) break;
-                if (deadline && cuframes_now_ns() > deadline) {
-                    uint64_t missing = bitmap & ~ack;
-                    atomic_fetch_and_explicit(&pub->hdr->subscriber_bitmap,
-                                               ~missing, memory_order_release);
-                    break;
-                }
-                struct timespec ts = {.tv_sec = 0, .tv_nsec = 100000};
-                nanosleep(&ts, NULL);
-            }
-        }
-    }
-    return do_publish(pub, slot, stream, pts_ns);
+    (void)pub; (void)cuda_ptr; (void)stream; (void)pts_ns;
+    return CUFRAMES_ERR_INVALID_ARG;  /* v0.4 — нет external mode */
 }

 int cuframes_publisher_destroy(cuframes_publisher_t *pub) {
@@ -497,13 +499,16 @@ int cuframes_publisher_destroy(cuframes_publisher_t *pub) {
        pub->accept_thread_alive = 0;
    }

-    /* Free CUDA */
-    if (pub->cfg.ownership == CUFRAMES_OWNERSHIP_LIBRARY) {
-        for (int i = 0; i < pub->ring_size_actual; ++i) {
-            if (pub->cuda_ptrs[i]) cudaFree(pub->cuda_ptrs[i]);
-        }
+    /* Free VMM */
+    if (pub->has_vmm_pool) {
+        free_vmm_pool(pub);
+    }
+
+    /* Packet ring cleanup (если активирован) */
+    if (pub->has_pkt_ring) {
+        cuframes_internal_pkt_ring_destroy(&pub->pkt_ring);
+        pub->has_pkt_ring = 0;
    }
-    if (pub->event) cudaEventDestroy(pub->event);

    /* Unlink resources */
    if (pub->hdr) {
@@ -523,8 +528,83 @@ int cuframes_publisher_destroy(cuframes_publisher_t *pub) {
    return CUFRAMES_OK;
 }

+/* ─────────────────────────────────────────────────────────────────────── */
+/* v0.2 — encoded packet ring API (см. docs/protocol.md §10)                */
+/* ─────────────────────────────────────────────────────────────────────── */
+
+/*
+ * Attach an encoded-packet ring to a publisher.
+ *
+ * pub   Publisher to extend.
+ * opts  Optional settings. May be NULL; any field left at zero falls back to
+ *       the library default (codec_id is taken as-is, 0 when opts is NULL).
+ *
+ * Returns CUFRAMES_OK on success,
+ *         CUFRAMES_ERR_INVALID_ARG    if pub is NULL or max_packet_size
+ *                                     exceeds data_size,
+ *         CUFRAMES_ERR_ALREADY_EXISTS if a packet ring is already enabled,
+ *         otherwise whatever cuframes_internal_pkt_ring_create reports.
+ */
+int cuframes_publisher_enable_packets(cuframes_publisher_t *pub,
+                                       const cuframes_packet_ring_options_t *opts) {
+    if (!pub) return CUFRAMES_ERR_INVALID_ARG;
+    if (pub->has_pkt_ring) return CUFRAMES_ERR_ALREADY_EXISTS;
+
+    /* Start from the defaults, then let non-zero option fields override. */
+    uint32_t ring_slots  = CUFRAMES_PKT_DEFAULT_SLOTS;
+    uint32_t ring_bytes  = CUFRAMES_PKT_DEFAULT_DATA_SIZE;
+    uint32_t packet_cap  = CUFRAMES_PKT_DEFAULT_MAX_SIZE;
+    uint32_t codec       = 0;
+    if (opts) {
+        if (opts->ring_slots)      ring_slots = opts->ring_slots;
+        if (opts->data_size)       ring_bytes = opts->data_size;
+        if (opts->max_packet_size) packet_cap = opts->max_packet_size;
+        codec = opts->codec_id;
+    }
+
+    /* A single packet must fit into the data ring. */
+    if (packet_cap > ring_bytes) {
+        CUFRAMES_LOG_ERROR("max_packet_size (%u) > data_size (%u)", packet_cap, ring_bytes);
+        return CUFRAMES_ERR_INVALID_ARG;
+    }
+
+    int rc = cuframes_internal_pkt_ring_create(pub->key, ring_slots, ring_bytes,
+                                                codec, &pub->pkt_ring);
+    if (rc != CUFRAMES_OK) {
+        return rc;
+    }
+
+    /* The v0.4 frame-header protocol version is not bumped for the packet
+     * ring — the two coexist. */
+    pub->max_packet_size = packet_cap;
+    pub->has_pkt_ring = 1;
+    return CUFRAMES_OK;
+}
+
+/*
+ * Store codec extradata in the publisher's packet ring.
+ *
+ * Returns CUFRAMES_ERR_INVALID_ARG if pub is NULL,
+ *         CUFRAMES_ERR_NO_PACKET_RING if no packet ring has been enabled,
+ *         otherwise the result of cuframes_internal_pkt_ring_set_extradata.
+ */
+int cuframes_publisher_set_codec_extradata(cuframes_publisher_t *pub,
+                                            const void *extradata, size_t size) {
+    if (pub == NULL) {
+        return CUFRAMES_ERR_INVALID_ARG;
+    }
+    if (pub->has_pkt_ring) {
+        return cuframes_internal_pkt_ring_set_extradata(&pub->pkt_ring,
+                                                         extradata, size);
+    }
+    return CUFRAMES_ERR_NO_PACKET_RING;
+}
+
+/*
+ * Publish one encoded packet through the publisher's packet ring.
+ *
+ * Returns CUFRAMES_ERR_INVALID_ARG      if pub is NULL,
+ *         CUFRAMES_ERR_NO_PACKET_RING   if no packet ring has been enabled,
+ *         CUFRAMES_ERR_PACKET_OVERSIZED if size exceeds the configured
+ *                                       max_packet_size,
+ *         otherwise the result of cuframes_internal_pkt_ring_publish.
+ */
+int cuframes_publisher_publish_packet(cuframes_publisher_t *pub,
+                                       const void *data, size_t size,
+                                       int64_t pts_ns, int64_t dts_ns,
+                                       uint32_t flags) {
+    int rc;
+
+    if (pub == NULL) {
+        rc = CUFRAMES_ERR_INVALID_ARG;
+    } else if (!pub->has_pkt_ring) {
+        rc = CUFRAMES_ERR_NO_PACKET_RING;
+    } else if (size > pub->max_packet_size) {
+        rc = CUFRAMES_ERR_PACKET_OVERSIZED;
+    } else {
+        rc = cuframes_internal_pkt_ring_publish(&pub->pkt_ring, data, size,
+                                                 pts_ns, dts_ns, flags);
+    }
+    return rc;
+}
+
 /* ─── Accept thread + handshake ──────────────────────────────────────── */

+/* Arguments handed to subscriber_monitor_thread; the thread frees the
+ * structure itself when it exits. */
+struct sub_monitor_args {
+    struct cuframes_publisher *pub;  /* publisher whose shared header is updated */
+    int fd;                          /* subscriber socket: recv()'d on, then closed */
+    uint32_t bit;                    /* subscriber's bit in subscriber_bitmap and
+                                      * its index into hdr->subscribers[] */
+};
+
+/*
+ * Per-subscriber watchdog thread.
+ *
+ * Blocks in recv() on the subscriber's socket and discards whatever arrives.
+ * When the peer closes the connection (recv returns 0) or the socket fails,
+ * the subscriber's bit is cleared from hdr->subscriber_bitmap, its state is
+ * reset to 0, the socket is closed and the argument block is freed.
+ *
+ * arg  Heap-allocated struct sub_monitor_args; ownership passes to this thread.
+ * Always returns NULL.
+ */
+static void *subscriber_monitor_thread(void *arg) {
+    struct sub_monitor_args *m = (struct sub_monitor_args *)arg;
+    char buf[64];
+
+    for (;;) {
+        ssize_t n = recv(m->fd, buf, sizeof(buf), 0);
+        if (n > 0) continue;                    /* payload is ignored */
+        /* A signal interrupting recv() is not a disconnect — retry instead
+         * of dropping a live subscriber from the bitmap. */
+        if (n < 0 && errno == EINTR) continue;
+        break;                                  /* EOF or a real socket error */
+    }
+
+    atomic_fetch_and_explicit(&m->pub->hdr->subscriber_bitmap,
+        ~(1ULL << m->bit), memory_order_release);
+    atomic_store_explicit(&m->pub->hdr->subscribers[m->bit].state, 0,
+        memory_order_release);
+    close(m->fd);
+    CUFRAMES_LOG_INFO("subscriber bit=%u disconnected — freed", m->bit);
+    free(m);
+    return NULL;
+}
+
 static void *accept_thread_main(void *arg) {
    struct cuframes_publisher *pub = (struct cuframes_publisher *)arg;
    while (!pub->stop_flag) {
@@ -537,21 +617,16 @@ static void *accept_thread_main(void *arg) {
            CUFRAMES_LOG_WARN("accept: %s", strerror(errno));
            continue;
        }
-        /* Synchronous handshake — после ответа socket остаётся открытым для
-         * lifetime signals (SHUTDOWN, PING). Close на error. */
        int r = handshake_subscriber(pub, client);
        if (r != CUFRAMES_OK) {
            close(client);
        }
-        /* TODO v0.2: track client fds для broadcast SHUTDOWN. Сейчас clients
-         * сами detect socket EOF при publisher_destroy через shutdown(). */
    }
    return NULL;
 }

 static int allocate_subscriber_bit(struct cuframes_publisher *pub,
                                    const char *name, uint32_t *bit_out) {
-    /* Bit 0 reserved (sentinel). Bits 1..31. */
    pthread_mutex_lock(&pub->state_mu);
    for (uint32_t bit = 1; bit < CUFRAMES_MAX_SUBSCRIBERS; ++bit) {
        uint64_t state = atomic_load_explicit(&pub->hdr->subscribers[bit].state,
@@ -571,7 +646,6 @@ static int allocate_subscriber_bit(struct cuframes_publisher *pub,
            pthread_mutex_unlock(&pub->state_mu);
            return CUFRAMES_OK;
        }
-        /* Check for name collision */
        if (name && state >= 2 &&
            strncmp(pub->hdr->subscribers[bit].consumer_name, name,
                    sizeof(pub->hdr->subscribers[bit].consumer_name)) == 0) {
@@ -598,7 +672,6 @@ static int handshake_subscriber(struct cuframes_publisher *pub, int client_fd) {
        return CUFRAMES_ERR_PROTOCOL;
    }

-    /* Parse HELLO_REQ: proto_version + name_len + name + cuda_device + mode */
    if (plen < sizeof(cuframes_msg_hello_req_t) + 20) return CUFRAMES_ERR_PROTOCOL;
    cuframes_msg_hello_req_t *hreq = (cuframes_msg_hello_req_t *)buf;
    uint32_t want_proto = hreq->proto_version;
@@ -608,18 +681,18 @@ static int handshake_subscriber(struct cuframes_publisher *pub, int client_fd) {
    char name[32] = {0};
    memcpy(name, buf + sizeof(*hreq), name_len);

-    int proto_match = (want_proto == CUFRAMES_PROTOCOL_V1);
+    /* v0.4 принимает только V4 consumers. Старые v0.3 fail здесь cleanly. */
+    int proto_match = (want_proto == CUFRAMES_PROTOCOL_V4);

    /* Send HELLO_RESP */
    uint8_t resp_buf[CUFRAMES_MAX_MSG_PAYLOAD];
    cuframes_msg_hello_resp_t *resp = (cuframes_msg_hello_resp_t *)resp_buf;
    memset(resp, 0, sizeof(*resp));
    resp->result = proto_match ? CUFRAMES_OK : CUFRAMES_ERR_PROTOCOL;
-    resp->proto_version_actual = CUFRAMES_PROTOCOL_V1;
+    resp->proto_version_actual = CUFRAMES_PROTOCOL_V4;
    resp->ring_size = (uint32_t)pub->ring_size_actual;
    resp->ownership_mode = (uint32_t)pub->cfg.ownership;
    resp->meta = pub->hdr->meta;
-    /* shm_path */
    int slen = snprintf((char *)(resp_buf + sizeof(*resp)),
                         sizeof(resp_buf) - sizeof(*resp) - 12,
                         "%s", pub->shm_name);
@@ -632,7 +705,11 @@ static int handshake_subscriber(struct cuframes_publisher *pub, int client_fd) {
        CUFRAMES_LOG_WARN("send HELLO_RESP: %s", cuframes_strerror(r));
        return r;
    }
-    if (!proto_match) return CUFRAMES_ERR_PROTOCOL;
+    if (!proto_match) {
+        CUFRAMES_LOG_WARN("subscriber proto v%u rejected (want v%u)",
+                          want_proto, CUFRAMES_PROTOCOL_V4);
+        return CUFRAMES_ERR_PROTOCOL;
+    }

    /* recv SUBSCRIBE_REQ */
    plen = sizeof(buf);
@@ -640,11 +717,9 @@ static int handshake_subscriber(struct cuframes_publisher *pub, int client_fd) {
    if (r != CUFRAMES_OK) return r;
    if (mtype != CUFRAMES_MSG_SUBSCRIBE_REQ) return CUFRAMES_ERR_PROTOCOL;

-    /* Allocate subscriber bit */
    uint32_t bit = 0;
    int alloc_r = allocate_subscriber_bit(pub, name, &bit);

-    /* Send SUBSCRIBE_RESP */
    cuframes_msg_subscribe_resp_t sresp = {0};
    sresp.result = alloc_r;
    sresp.assigned_bit = bit;
@@ -655,13 +730,42 @@ static int handshake_subscriber(struct cuframes_publisher *pub, int client_fd) {
                                     &sresp, sizeof(sresp));
    if (r != CUFRAMES_OK || alloc_r != CUFRAMES_OK) return r ? r : alloc_r;

-    /* Activate subscriber slot */
+    /* v0.4 — отправить VMM_FDS с N posix FDs через SCM_RIGHTS */
+    cuframes_msg_vmm_fds_t vmm_payload = {0};
+    vmm_payload.slot_size_bytes = pub->vmm_slot_size;
+    vmm_payload.fd_count = (uint32_t)pub->ring_size_actual;
+    r = cuframes_internal_send_msg_with_fds(client_fd, CUFRAMES_MSG_VMM_FDS,
+                                             &vmm_payload, sizeof(vmm_payload),
+                                             pub->vmm_fds,
+                                             (uint32_t)pub->ring_size_actual);
+    if (r != CUFRAMES_OK) {
+        CUFRAMES_LOG_WARN("send VMM_FDS: %s", cuframes_strerror(r));
+        /* roll back bit allocation */
+        atomic_fetch_and_explicit(&pub->hdr->subscriber_bitmap,
+            ~(1ULL << bit), memory_order_release);
+        atomic_store_explicit(&pub->hdr->subscribers[bit].state, 0,
+            memory_order_release);
+        return r;
+    }
+
    atomic_store_explicit(&pub->hdr->subscribers[bit].state, 2,
                           memory_order_release);

-    CUFRAMES_LOG_INFO("subscriber '%s' connected (bit=%u)", name, bit);
+    CUFRAMES_LOG_INFO("subscriber '%s' connected (bit=%u, %d VMM FDs)",
+                      name, bit, pub->ring_size_actual);

-    /* TODO v0.2: spawn per-client thread для liveness/PING/UNSUBSCRIBE.
-     * Сейчас socket остаётся открытым на heap'е до publisher_destroy. */
+    /* Spawn monitor thread */
+    struct sub_monitor_args *m = malloc(sizeof(*m));
+    if (!m) return CUFRAMES_OK;
+    m->pub = pub;
+    m->fd = client_fd;
+    m->bit = bit;
+    pthread_t monitor_tid;
+    if (pthread_create(&monitor_tid, NULL, subscriber_monitor_thread, m) != 0) {
+        CUFRAMES_LOG_WARN("monitor pthread_create fail — bit %u may leak", bit);
+        free(m);
+    } else {
+        pthread_detach(monitor_tid);
+    }
    return CUFRAMES_OK;
 }
@@ -3,7 +3,9 @@
 #include "internal.h"
 #include <errno.h>
 #include <poll.h>
+#include <string.h>
 #include <sys/socket.h>
+#include <sys/uio.h>
 #include <unistd.h>

 /* Read exactly N bytes from socket, with poll-based timeout. */
@@ -97,3 +99,121 @@ int cuframes_internal_recv_msg(int fd, uint32_t *msg_type_out,
    if (payload_len_inout) *payload_len_inout = h.payload_length;
    return CUFRAMES_OK;
 }
+
+/* v0.4 — send one TLV message plus N file descriptors via SCM_RIGHTS.
+ *
+ * The header and payload go out in a single sendmsg() (two iovec entries),
+ * the descriptors travel in the control (ancillary) buffer.
+ * header.payload_length describes ONLY the payload bytes; the FDs are
+ * delivered out-of-band.
+ *
+ * Parameters:
+ *   sock_fd      connected socket to send on
+ *   msg_type     value stored in the TLV header
+ *   payload      payload bytes; may be NULL only when payload_len == 0
+ *   payload_len  payload size, at most CUFRAMES_MAX_MSG_PAYLOAD
+ *   fds          descriptors to pass; may be NULL only when fd_count == 0
+ *   fd_count     number of descriptors, at most 64
+ *
+ * Returns CUFRAMES_OK on success, CUFRAMES_ERR_INVALID_ARG on bad arguments,
+ * CUFRAMES_ERR_DISCONNECTED on EPIPE, CUFRAMES_ERR_IO on any other sendmsg()
+ * failure, or whatever send_all() returns while finishing a short write. */
+int cuframes_internal_send_msg_with_fds(int sock_fd, uint32_t msg_type,
+                                         const void *payload, uint32_t payload_len,
+                                         const int *fds, uint32_t fd_count) {
+    if (payload_len > CUFRAMES_MAX_MSG_PAYLOAD) return CUFRAMES_ERR_INVALID_ARG;
+    /* A header announcing payload bytes that are never sent would desync the
+     * peer, so a NULL payload with a non-zero length is rejected up front. */
+    if (payload_len > 0 && !payload) return CUFRAMES_ERR_INVALID_ARG;
+    if (fd_count > 0 && !fds) return CUFRAMES_ERR_INVALID_ARG;
+    if (fd_count > 64) return CUFRAMES_ERR_INVALID_ARG;
+
+    cuframes_msg_header_t h = {.msg_type = msg_type, .payload_length = payload_len};
+
+    struct iovec iov[2];
+    iov[0].iov_base = &h;             iov[0].iov_len = sizeof(h);
+    iov[1].iov_base = (void *)payload; iov[1].iov_len = payload_len;
+
+    struct msghdr msg = {0};
+    msg.msg_iov = iov;
+    msg.msg_iovlen = (payload_len > 0) ? 2 : 1;
+
+    /* The union forces the alignment struct cmsghdr requires; a bare char
+     * array carries no such guarantee. */
+    union {
+        char buf[CMSG_SPACE(sizeof(int) * 64)];
+        struct cmsghdr align;
+    } ctrl = {0};
+    if (fd_count > 0) {
+        msg.msg_control = ctrl.buf;
+        msg.msg_controllen = CMSG_SPACE(sizeof(int) * fd_count);
+        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type = SCM_RIGHTS;
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int) * fd_count);
+        memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * fd_count);
+    }
+
+    /* EINTR means nothing was transmitted, so retrying is safe. */
+    ssize_t n;
+    do {
+        n = sendmsg(sock_fd, &msg, MSG_NOSIGNAL);
+    } while (n < 0 && errno == EINTR);
+    if (n < 0) {
+        if (errno == EPIPE) return CUFRAMES_ERR_DISCONNECTED;
+        return CUFRAMES_ERR_IO;
+    }
+
+    /* A short write is rare for payloads this small, but handle it. The header
+     * and the payload live in two separate buffers, so the remainder has to
+     * be resumed in whichever buffer the kernel stopped in. */
+    size_t sent = (size_t)n;
+    if (sent < sizeof(h)) {
+        int r = send_all(sock_fd, (uint8_t *)&h + sent, sizeof(h) - sent);
+        if (r != CUFRAMES_OK) return r;
+        sent = sizeof(h);
+    }
+    size_t payload_sent = sent - sizeof(h);
+    if (payload_sent < payload_len) {
+        return send_all(sock_fd, (uint8_t *)payload + payload_sent,
+                        payload_len - payload_sent);
+    }
+    return CUFRAMES_OK;
+}
+
+/* Receive one TLV message together with any SCM_RIGHTS descriptors.
+ *
+ * Parameters:
+ *   sock_fd            connected socket to read from
+ *   msg_type_out       optional; receives header.msg_type
+ *   payload            buffer for the payload bytes (may be NULL if none expected)
+ *   payload_len_inout  in: capacity of payload; out: actual payload length
+ *   fds_out            array receiving the passed descriptors
+ *   fd_count_inout     in: capacity of fds_out; out: descriptors stored
+ *   timeout_ms         >= 0: wait at most this long for data; < 0: block
+ *
+ * Returns CUFRAMES_OK, CUFRAMES_ERR_TIMEOUT, CUFRAMES_ERR_DISCONNECTED (EOF),
+ * CUFRAMES_ERR_IO, CUFRAMES_ERR_PROTOCOL (short header, oversized payload or
+ * truncated control data) or CUFRAMES_ERR_INVALID_ARG (payload buffer missing
+ * or too small).
+ *
+ * Ownership: on success the caller owns the descriptors in fds_out. On any
+ * error after recvmsg() every descriptor already received is closed here and
+ * *fd_count_inout is set to 0, so nothing leaks into the process.
+ *
+ * NOTE(review): the payload iovec is sized to the caller's capacity, not to
+ * header.payload_length, so if another message is already queued behind this
+ * one its bytes could presumably be pulled into the payload buffer — confirm
+ * the protocol never pipelines a message right after one read through here. */
+int cuframes_internal_recv_msg_with_fds(int sock_fd, uint32_t *msg_type_out,
+                                         void *payload, uint32_t *payload_len_inout,
+                                         int *fds_out, uint32_t *fd_count_inout,
+                                         int32_t timeout_ms) {
+    /* Poll first — recvmsg() blocks, so the timeout would otherwise never fire. */
+    if (timeout_ms >= 0) {
+        struct pollfd pfd = {.fd = sock_fd, .events = POLLIN};
+        int pr = poll(&pfd, 1, timeout_ms);
+        if (pr == 0) return CUFRAMES_ERR_TIMEOUT;
+        if (pr < 0) return CUFRAMES_ERR_IO;
+    }
+
+    cuframes_msg_header_t h;
+    struct iovec iov[2];
+    iov[0].iov_base = &h;     iov[0].iov_len = sizeof(h);
+    iov[1].iov_base = payload; iov[1].iov_len = (payload && payload_len_inout) ? *payload_len_inout : 0;
+
+    uint32_t want_fds = (fd_count_inout && fds_out) ? *fd_count_inout : 0;
+    /* The union forces the alignment struct cmsghdr requires. */
+    union {
+        char buf[CMSG_SPACE(sizeof(int) * 64)];
+        struct cmsghdr align;
+    } ctrl = {0};
+    struct msghdr msg = {0};
+    msg.msg_iov = iov;
+    msg.msg_iovlen = (iov[1].iov_len > 0) ? 2 : 1;
+    msg.msg_control = ctrl.buf;
+    msg.msg_controllen = sizeof(ctrl.buf);
+
+    ssize_t n;
+    do {
+        n = recvmsg(sock_fd, &msg, 0);
+    } while (n < 0 && errno == EINTR);
+    if (n == 0) return CUFRAMES_ERR_DISCONNECTED;
+    if (n < 0) return CUFRAMES_ERR_IO;
+
+    /* Collect SCM_RIGHTS descriptors before validating anything else: once
+     * recvmsg() has returned they are open in this process, so every later
+     * error path must know about them in order to close them. */
+    uint32_t got_fds = 0;
+    for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); cmsg;
+         cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+        if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) {
+            continue;
+        }
+        size_t blob = cmsg->cmsg_len - CMSG_LEN(0);
+        uint32_t n_fds = (uint32_t)(blob / sizeof(int));
+        if (n_fds == 0) continue;
+        if (got_fds + n_fds > want_fds) {
+            /* More descriptors than the caller has room for — close this
+             * batch so it does not leak. */
+            for (uint32_t i = 0; i < n_fds; i++) {
+                int f;
+                memcpy(&f, CMSG_DATA(cmsg) + i * sizeof(int), sizeof(int));
+                close(f);
+            }
+            continue;
+        }
+        memcpy(fds_out + got_fds, CMSG_DATA(cmsg), (size_t)n_fds * sizeof(int));
+        got_fds += n_fds;
+    }
+
+    int rc = CUFRAMES_OK;
+    /* MSG_CTRUNC: the control buffer was too small, part of the descriptor
+     * set was dropped — the message cannot be trusted. */
+    if (msg.msg_flags & MSG_CTRUNC) { rc = CUFRAMES_ERR_PROTOCOL; goto fail; }
+    if ((size_t)n < sizeof(h)) { rc = CUFRAMES_ERR_PROTOCOL; goto fail; }
+
+    if (msg_type_out) *msg_type_out = h.msg_type;
+    if (h.payload_length > CUFRAMES_MAX_MSG_PAYLOAD) {
+        rc = CUFRAMES_ERR_PROTOCOL;
+        goto fail;
+    }
+
+    /* If recvmsg() delivered less than payload_length, fetch the rest. */
+    size_t got_payload = (size_t)n - sizeof(h);
+    if (h.payload_length > 0) {
+        if (!payload || !payload_len_inout || *payload_len_inout < h.payload_length) {
+            rc = CUFRAMES_ERR_INVALID_ARG;
+            goto fail;
+        }
+        if (got_payload < h.payload_length) {
+            rc = recv_all(sock_fd, (uint8_t *)payload + got_payload,
+                          h.payload_length - got_payload, timeout_ms);
+            if (rc != CUFRAMES_OK) goto fail;
+        }
+        *payload_len_inout = h.payload_length;
+    } else if (payload_len_inout) {
+        *payload_len_inout = 0;
+    }
+
+    if (fd_count_inout) *fd_count_inout = got_fds;
+    return CUFRAMES_OK;
+
+fail:
+    /* got_fds > 0 implies fds_out != NULL (want_fds is 0 otherwise). */
+    for (uint32_t i = 0; i < got_fds; i++) {
+        close(fds_out[i]);
+    }
+    if (fd_count_inout) *fd_count_inout = 0;
+    return rc;
+}
@@ -32,6 +32,10 @@ const char *cuframes_strerror(int err) {
        case CUFRAMES_ERR_FORMAT:         return "unsupported format or size mismatch";
        case CUFRAMES_ERR_WOULD_BLOCK:    return "would block";
        case CUFRAMES_ERR_TOO_MANY:       return "too many subscribers (max 32)";
+        case CUFRAMES_ERR_PACKET_OVERSIZED: return "packet exceeds max_packet_size";
+        case CUFRAMES_ERR_NO_PACKET_RING: return "publisher has no packet ring";
+        case CUFRAMES_ERR_NO_CODEC_PARAMS: return "codec extradata not set by publisher";
+        case CUFRAMES_ERR_PACKET_OVERRUN: return "packet ring overrun — resync on keyframe";
        case CUFRAMES_ERR_INTERNAL:       return "internal error (please report)";
        default:                          return "unknown error";
    }
@@ -83,6 +87,15 @@ int cuframes_internal_shm_name(const char *key, char *out, size_t out_size) {
    return CUFRAMES_OK;
 }

+/* Build the packet-ring SHM object name for `key` into `out`:
+ * CUFRAMES_SHM_PREFIX + key + CUFRAMES_PKT_SHM_SUFFIX.
+ *
+ * Returns the validation error if cuframes_internal_validate_key() rejects
+ * the key, CUFRAMES_ERR_INVALID_ARG if formatting fails or the result does
+ * not fit in out_size bytes (including the terminator), CUFRAMES_OK otherwise. */
+int cuframes_internal_pkt_shm_name(const char *key, char *out, size_t out_size) {
+    const int key_status = cuframes_internal_validate_key(key);
+    if (key_status != CUFRAMES_OK) {
+        return key_status;
+    }
+
+    const int written = snprintf(out, out_size, "%s%s%s",
+                                 CUFRAMES_SHM_PREFIX, key, CUFRAMES_PKT_SHM_SUFFIX);
+    const int fits = (written >= 0) && ((size_t)written < out_size);
+    return fits ? CUFRAMES_OK : CUFRAMES_ERR_INVALID_ARG;
+}
+
 int cuframes_internal_ensure_runtime_dir(void) {
    if (mkdir(CUFRAMES_RUNTIME_DIR, 0755) == 0) return CUFRAMES_OK;
    if (errno == EEXIST) return CUFRAMES_OK;
@@ -22,3 +22,11 @@ target_include_directories(test_stress PRIVATE
    ${CMAKE_SOURCE_DIR}/include)
 add_test(NAME stress_4consumer COMMAND test_stress)
 set_tests_properties(stress_4consumer PROPERTIES TIMEOUT 120)
+
+# v0.2 — packet ring tests (host-only, без CUDA в test-коде)
+add_executable(test_packet_ring test_packet_ring.c)
+target_link_libraries(test_packet_ring PRIVATE cuframes)
+target_include_directories(test_packet_ring PRIVATE
+    ${CMAKE_SOURCE_DIR}/include)
+add_test(NAME packet_ring_basic COMMAND test_packet_ring)
+set_tests_properties(packet_ring_basic PROPERTIES TIMEOUT 120)
@@ -0,0 +1,280 @@
+/* Stress test для encoded packet ring (v0.2).
+ *
+ * Сценарии:
+ *  1) Normal flow: 1 publisher × 1 subscriber × 2000 packets, varied sizes,
+ *     каждые 30 packets — KEY flag (имитация GOP). Subscriber проверяет:
+ *     - монотонные seq (без пропусков в этом тесте — fast consumer)
+ *     - data integrity через checksum (XOR fold)
+ *     - PTS/DTS monotonic, KEY flag доходит
+ *  2) Slow subscriber: publisher шлёт быстрее чем subscriber читает →
+ *     должен случиться OVERRUN, library resync'нет на keyframe.
+ *  3) Cleanup: после exit нет leaked SHM в /dev/shm.
+ *
+ * Без CUDA-зависимостей (packets host-side).
+ */
+#include <cuframes/cuframes.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#define KEY "test_pkt_ring"
+#define TOTAL_PACKETS 2000
+#define GOP_SIZE 30
+#define SMALL_PKT 4096
+#define LARGE_PKT (256 * 1024)
+
+#define CHECK(call) do { int _r = (call); if (_r != 0) { \
+    fprintf(stderr, "FAIL %s:%d (rc=%d): %s\n", __FILE__, __LINE__, _r, \
+        cuframes_strerror(_r)); exit(2); } } while (0)
+
+#define EXPECT_TRUE(cond) do { if (!(cond)) { \
+    fprintf(stderr, "EXPECT_TRUE failed at %s:%d: %s\n", \
+        __FILE__, __LINE__, #cond); exit(2); } } while (0)
+
+/* Fill `buf` with a deterministic test payload of `size` bytes.
+ *
+ * The first sizeof(seq) bytes hold `seq` in host byte order (memcpy of the
+ * integer); every following byte i holds (seq + i) & 0xFF.
+ * If `size` is smaller than sizeof(seq) only the first `size` bytes of the
+ * sequence number are written, so the buffer is never overrun. */
+static void gen_payload(uint8_t *buf, size_t size, uint64_t seq) {
+    size_t hdr = size < sizeof(seq) ? size : sizeof(seq);
+    memcpy(buf, &seq, hdr);
+    for (size_t i = hdr; i < size; ++i) {
+        buf[i] = (uint8_t)((seq + i) & 0xFF);
+    }
+}
+
+/* Check that `buf` holds the payload gen_payload() produces for `expected_seq`.
+ *
+ * Returns 0 on match, -1 if the buffer is too short to hold the sequence
+ * number, -2 if the embedded sequence number differs, -3 if a pattern byte
+ * differs. */
+static int verify_payload(const uint8_t *buf, size_t size, uint64_t expected_seq) {
+    const size_t seq_bytes = sizeof(uint64_t);
+    if (size < seq_bytes) {
+        return -1;
+    }
+
+    uint64_t stored;
+    memcpy(&stored, buf, seq_bytes);
+    if (stored != expected_seq) {
+        return -2;
+    }
+
+    size_t pos = seq_bytes;
+    while (pos < size) {
+        const uint8_t want = (uint8_t)((expected_seq + pos) & 0xFF);
+        if (buf[pos] != want) {
+            return -3;
+        }
+        ++pos;
+    }
+    return 0;
+}
+
+/* Create the test publisher for KEY (320x240 NV12, 2-slot frame ring),
+ * enable its encoded packet ring and install a fake 16-byte extradata blob.
+ * Any failing library call aborts the process through CHECK(). */
+static cuframes_publisher_t *make_publisher(void) {
+    cuframes_publisher_config_t pub_cfg = {
+        .key = KEY,
+        .width = 320,
+        .height = 240,
+        .format = CUFRAMES_FORMAT_NV12,
+        .ownership = CUFRAMES_OWNERSHIP_LIBRARY,
+        .ring_size = 2,
+        .policy = CUFRAMES_POLICY_DROP_OLDEST,
+        .cuda_device = 0,
+    };
+    cuframes_publisher_t *publisher = NULL;
+    CHECK(cuframes_publisher_create(&pub_cfg, &publisher));
+
+    cuframes_packet_ring_options_t ring_opts = {
+        .codec_id = 27,  /* AV_CODEC_ID_H264 */
+        .ring_slots = 64,
+        .data_size = 8 * 1024 * 1024,
+        .max_packet_size = LARGE_PKT * 2,
+    };
+    CHECK(cuframes_publisher_enable_packets(publisher, &ring_opts));
+
+    /* Fake SPS/PPS — 16 bytes: 0xAA, 0xAB, ... */
+    uint8_t fake_extradata[16];
+    for (size_t k = 0; k < sizeof(fake_extradata); ++k) {
+        fake_extradata[k] = (uint8_t)(0xAA + k);
+    }
+    CHECK(cuframes_publisher_set_codec_extradata(publisher, fake_extradata,
+                                                 sizeof(fake_extradata)));
+    return publisher;
+}
+
+/* Subscriber side of a scenario.
+ *
+ * Connects to KEY, enables the packet ring, checks the codec parameters and
+ * then reads packets for up to ~30 s, verifying every payload against its
+ * sequence number.
+ *
+ * read_delay_us      sleep after each packet; > 0 imitates a slow consumer
+ * out_received       number of packets read successfully
+ * out_overruns       number of CUFRAMES_ERR_PACKET_OVERRUN results seen
+ * out_first_key_seq  seq of the first packet carrying CUFRAMES_PKT_FLAG_KEY,
+ *                    or -1 if none was seen
+ *
+ * Returns 0; setup failures and payload corruption terminate the process via
+ * CHECK()/EXPECT_TRUE(). */
+static int run_subscriber(int read_delay_us, int *out_received, int *out_overruns,
+                          int *out_first_key_seq) {
+    /* Give the publisher time to create its SHM. */
+    usleep(100 * 1000);
+
+    cuframes_subscriber_config_t cfg = {0};
+    cfg.key = KEY;
+    cfg.mode = CUFRAMES_MODE_NEWEST_ONLY;
+    cfg.cuda_device = 0;
+    cfg.connect_timeout_ms = 5000;
+    cuframes_subscriber_t *sub = NULL;
+    CHECK(cuframes_subscriber_create(&cfg, &sub));
+
+    CHECK(cuframes_subscriber_enable_packets(sub));
+
+    /* Verify codec params against what make_publisher() configured. */
+    uint32_t codec_id = 0;
+    const void *extradata = NULL;
+    size_t extradata_sz = 0;
+    int r = cuframes_subscriber_get_codec_params(sub, &codec_id, &extradata, &extradata_sz);
+    EXPECT_TRUE(r == CUFRAMES_OK);
+    EXPECT_TRUE(codec_id == 27);
+    EXPECT_TRUE(extradata_sz == 16);
+
+    int received = 0;
+    int overruns = 0;
+    int first_key_seq = -1;
+    int64_t last_pts = -1;
+    int data_errors = 0;
+
+    /* Run for ~30 s or until the publisher is done. */
+    time_t start = time(NULL);
+    while (time(NULL) - start < 30) {
+        cuframes_packet_t *pkt = NULL;
+        int rc = cuframes_subscriber_next_packet(sub, &pkt, 500);
+        if (rc == CUFRAMES_ERR_TIMEOUT || rc == CUFRAMES_ERR_WOULD_BLOCK) {
+            if (received >= TOTAL_PACKETS / 2) break; /* enough for the test */
+            continue;
+        }
+        if (rc == CUFRAMES_ERR_DISCONNECTED) break;
+        if (rc == CUFRAMES_ERR_PACKET_OVERRUN) {
+            overruns++;
+            continue; /* the library resyncs on the next call */
+        }
+        if (rc != CUFRAMES_OK) {
+            fprintf(stderr, "next_packet rc=%d (%s)\n", rc, cuframes_strerror(rc));
+            break;
+        }
+
+        const uint8_t *data = (const uint8_t *)cuframes_packet_data(pkt);
+        size_t size = cuframes_packet_size(pkt);
+        int64_t pts = cuframes_packet_pts(pkt);
+        uint32_t flags = cuframes_packet_flags(pkt);
+        uint64_t seq = cuframes_packet_seq(pkt);
+
+        if (verify_payload(data, size, seq) != 0) {
+            data_errors++;
+        }
+
+        if ((flags & CUFRAMES_PKT_FLAG_KEY) && first_key_seq < 0) {
+            first_key_seq = (int)seq;
+        }
+        if (pts <= last_pts && last_pts >= 0) {
+            /* int64_t/uint64_t are not `long` on every ABI, so cast to the
+             * long long types that %lld/%llu are defined for. */
+            fprintf(stderr, "PTS не монотонно: %lld <= %lld (seq=%llu)\n",
+                    (long long)pts, (long long)last_pts,
+                    (unsigned long long)seq);
+        }
+        last_pts = pts;
+        received++;
+
+        cuframes_subscriber_release_packet(sub, pkt);
+
+        if (read_delay_us > 0) usleep(read_delay_us);
+    }
+
+    EXPECT_TRUE(data_errors == 0);
+    cuframes_subscriber_destroy(sub);
+
+    *out_received = received;
+    *out_overruns = overruns;
+    *out_first_key_seq = first_key_seq;
+    return 0;
+}
+
+/* Publisher side of a scenario: create the publisher and push
+ * `total_packets` generated packets, sleeping `inter_packet_us` microseconds
+ * between them (no sleep when <= 0). Every GOP_SIZE-th packet is a
+ * LARGE_PKT-sized keyframe; the others are SMALL_PKT plus 0..7 KiB.
+ * Publish failures are only logged. */
+static void publisher_loop(int total_packets, int inter_packet_us) {
+    cuframes_publisher_t *publisher = make_publisher();
+
+    /* One scratch buffer sized for the largest packet, reused every round. */
+    uint8_t *scratch = (uint8_t *)malloc(LARGE_PKT);
+    EXPECT_TRUE(scratch != NULL);
+
+    int idx = 0;
+    while (idx < total_packets) {
+        size_t nbytes;
+        uint32_t pkt_flags;
+        if (idx % GOP_SIZE == 0) {
+            nbytes = LARGE_PKT;
+            pkt_flags = CUFRAMES_PKT_FLAG_KEY;
+        } else {
+            nbytes = SMALL_PKT + (idx % 8) * 1024;
+            pkt_flags = 0;
+        }
+        gen_payload(scratch, nbytes, (uint64_t)idx);
+
+        /* ~30 fps timeline; the same stamp is passed for both timestamp
+         * arguments. */
+        const int64_t stamp_ns = (int64_t)idx * 33333333LL;
+        const int status = cuframes_publisher_publish_packet(
+            publisher, scratch, nbytes, stamp_ns, stamp_ns, pkt_flags);
+        if (status != CUFRAMES_OK) {
+            fprintf(stderr, "publish rc=%d size=%zu\n", status, nbytes);
+        }
+
+        if (inter_packet_us > 0) {
+            usleep(inter_packet_us);
+        }
+        ++idx;
+    }
+
+    free(scratch);
+    cuframes_publisher_destroy(publisher);
+}
+
+/* Report leftover shared-memory objects for KEY under /dev/shm (the frame
+ * ring and the "-packets" ring). Returns 1 if either file still exists,
+ * 0 otherwise; each leftover is logged to stderr. */
+static int check_no_leaked_shm(void) {
+    static const char *const path_formats[] = {
+        "/dev/shm/cuframes-%s",
+        "/dev/shm/cuframes-%s-packets",
+    };
+    int leaked = 0;
+
+    for (size_t k = 0; k < sizeof(path_formats) / sizeof(path_formats[0]); ++k) {
+        char shm_path[256];
+        snprintf(shm_path, sizeof(shm_path), path_formats[k], KEY);
+        if (access(shm_path, F_OK) != 0) {
+            continue;
+        }
+        fprintf(stderr, "LEAKED %s\n", shm_path);
+        leaked = 1;
+    }
+    return leaked;
+}
+
+/* Scenario 1: a subscriber with no read delay against a publisher that
+ * sleeps 1 ms between packets. The forked child runs the subscriber and
+ * asserts it received at least half of TOTAL_PACKETS, saw no overruns and
+ * observed a keyframe; the parent publishes and then requires the child to
+ * exit with status 0. Returns 0 (failures exit the process). */
+static int scenario_normal_flow(void) {
+    fprintf(stderr, "[scenario 1] normal flow — fast consumer\n");
+
+    pid_t pid = fork();
+    EXPECT_TRUE(pid >= 0);
+    if (pid == 0) {
+        /* child = subscriber */
+        int received = 0, overruns = 0, first_key = -1;
+        run_subscriber(0, &received, &overruns, &first_key);
+        fprintf(stderr, "  consumer: received=%d overruns=%d first_key_seq=%d\n",
+                received, overruns, first_key);
+        EXPECT_TRUE(received >= TOTAL_PACKETS / 2);
+        EXPECT_TRUE(overruns == 0);
+        EXPECT_TRUE(first_key >= 0);
+        exit(0);
+    }
+
+    /* parent = publisher (slower than the consumer) */
+    publisher_loop(TOTAL_PACKETS, 1000); /* 1 ms between packets = 1000 fps */
+    int status = 0;
+    waitpid(pid, &status, 0);
+    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+    return 0;
+}
+
+/* Scenario 2: a subscriber that sleeps 10 ms per packet against a publisher
+ * that sleeps 5 ms between packets. The forked child asserts that at least
+ * one overrun was reported and that more than 10 packets were still received;
+ * the parent publishes and then requires the child to exit with status 0.
+ * Returns 0 (failures exit the process). */
+static int scenario_slow_consumer(void) {
+    fprintf(stderr, "[scenario 2] slow consumer — must hit OVERRUN + resync\n");
+
+    pid_t pid = fork();
+    EXPECT_TRUE(pid >= 0);
+    if (pid == 0) {
+        /* child = very slow subscriber */
+        int received = 0, overruns = 0, first_key = -1;
+        run_subscriber(10 * 1000, &received, &overruns, &first_key); /* 10ms */
+        fprintf(stderr, "  consumer: received=%d overruns=%d first_key_seq=%d\n",
+                received, overruns, first_key);
+        /* Overruns are expected because the publisher is faster */
+        EXPECT_TRUE(overruns > 0);
+        /* ...and something must still have been received (resync works) */
+        EXPECT_TRUE(received > 10);
+        exit(0);
+    }
+
+    /* publisher fast — 200 fps */
+    publisher_loop(TOTAL_PACKETS, 5 * 1000);
+    int status = 0;
+    waitpid(pid, &status, 0);
+    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+    return 0;
+}
+
+/* Run every scenario in order; after each one wait 200 ms and fail with
+ * exit code 2 if shared-memory files for KEY were left behind. */
+int main(void) {
+    int (*const scenarios[])(void) = {
+        scenario_normal_flow,
+        scenario_slow_consumer,
+    };
+
+    signal(SIGPIPE, SIG_IGN);
+
+    for (size_t idx = 0; idx < sizeof(scenarios) / sizeof(scenarios[0]); ++idx) {
+        scenarios[idx]();
+        /* Let the previous run settle before looking for leftovers. */
+        usleep(200 * 1000);
+        if (check_no_leaked_shm() != 0) {
+            exit(2);
+        }
+    }
+
+    fprintf(stderr, "OK — all scenarios passed\n");
+    return 0;
+}
@@ -0,0 +1,7 @@
+build/
+dist/
+*.egg-info/
+__pycache__/
+*.pyc
+*.so
+.pytest_cache/
@@ -0,0 +1,52 @@
+# Python bindings for cuframes — pybind11 module.
+#
+# Build setup: used as a subdirectory of the root CMakeLists.txt when
+# BUILD_PYTHON_BINDINGS=ON, or standalone through scikit-build-core
+# (see pyproject.toml).
+#
+# Output: a single shared module `_native.so` that is imported from the
+# Python package `cuframes` (cuframes/__init__.py re-exports the public API).
+
+include(FetchContent)
+
+# pybind11 — header-only + helper functions. FetchContent is used so that no
+# system install is required; the tag is pinned for reproducible builds.
+FetchContent_Declare(
+    pybind11
+    GIT_REPOSITORY https://github.com/pybind/pybind11.git
+    GIT_TAG v2.13.6
+    GIT_SHALLOW TRUE
+)
+FetchContent_MakeAvailable(pybind11)
+
+pybind11_add_module(_native MODULE
+    src/_native.cpp
+)
+
+target_include_directories(_native PRIVATE
+    ${PROJECT_SOURCE_DIR}/include
+)
+
+target_link_libraries(_native PRIVATE
+    cuframes  # imported target from libcuframes/CMakeLists.txt
+)
+
+# The module version follows libcuframes (see cuframes.h)
+target_compile_definitions(_native PRIVATE
+    CUFRAMES_PY_BINDING_VERSION="${PROJECT_VERSION}"
+)
+
+set_target_properties(_native PROPERTIES
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
+    CXX_VISIBILITY_PRESET hidden
+    INTERPROCEDURAL_OPTIMIZATION TRUE
+)
+
+# In a scikit-build-core build the module lands in the wheel next to the
+# package's Python sources. In a standalone CMake build it is installed under
+# ${CMAKE_INSTALL_LIBDIR}/cuframes.
+# NOTE(review): CMAKE_INSTALL_LIBDIR is presumably provided by GNUInstallDirs
+# in the root project — confirm it is included before this directory is added.
+if(SKBUILD)
+    install(TARGETS _native DESTINATION cuframes)
+else()
+    install(TARGETS _native LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/cuframes)
+endif()
@@ -0,0 +1,53 @@
+# cuframes — Python bindings
+
+Status: **WIP** (Phase 0 skeleton — issue [gx/cuframes#6](http://server:3000/gx/cuframes/issues/6))
+
+Это пакет Python-обёрток над `libcuframes` (C ABI). Цель — позволить
+downstream ML/CV пайплайнам (yolo-world-detector, zone-motion, custom
+скриптам) подписываться на cuframes без CPU round-trip: получать NV12
+frames прямо как CUDA pointer / `torch.Tensor` (DLPack export, zero-copy).
+
+## Текущий статус (что уже работает в этом skeleton)
+
+- Module import: `import cuframes` загружает `_native.so`
+- Версия: `cuframes.version_string()`, `cuframes.protocol_version()`
+- Enums: `PixelFormat`, `SubscriberMode`
+- Иерархия исключений: `CuframesError` + 8 subclasses (publisher gone,
+  frame timeout, device lost, и т. д.)
+
+## Что в работе (см. tasks #198-#202)
+
+- [ ] `CuframesSubscriber` + `CuframesFrame` lifecycle
+- [ ] DLPack export → `torch.from_dlpack`, `cupy.from_dlpack`
+- [ ] Context manager (`with cuframes.subscribe(key) as sub:`)
+- [ ] Per-subscriber CUDA stream
+- [ ] Health/stats properties (`ring_occupancy`, `drop_count`)
+- [ ] Thread-safety contract документация
+
+## Build (dev)
+
+Standalone wheel:
+
+```bash
+cd python/
+pip install -e . --no-build-isolation
+```
+
+Через корневой CMake-проект (вместе с libcuframes):
+
+```bash
+cmake -B build -DBUILD_PYTHON_BINDINGS=ON
+cmake --build build -j
+```
+
+## Зависимости
+
+- `libcuframes` ≥ 0.4 (линкуется из соседнего CMake target)
+- CUDA Toolkit 12+
+- `pybind11` 2.13+ (берётся через FetchContent при CMake-сборке)
+- Python 3.10+
+- Опционально: `torch>=2.4` или `cupy-cuda12x>=13` для DLPack-потребителей
+
+## Лицензия
+
+LGPL-2.1+ (как у libcuframes).
@@ -0,0 +1,77 @@
+"""cuframes — zero-copy CUDA frame sharing.
+
+Python bindings to libcuframes. See docs/python.md (TBD) for the
+architecture, the threading contract and PyTorch/CuPy integration examples.
+
+Example (subscriber side):
+
+    import cuframes
+
+    with cuframes.subscribe("cam-parking",
+                            consumer_name="yolo-world",
+                            connect_timeout_ms=5000) as sub:
+        # next_frame returns a CuframesFrame, which is a context manager
+        with sub.next_frame(timeout_ms=1000) as frame:
+            print(frame.cuda_ptr, frame.width, frame.height,
+                  frame.pitch_y, frame.seq, frame.pts_ns)
+            # DLPack export lands in task #199; until then use cuda-python:
+            # cuda_arr = cuda.from_pointer(frame.cuda_ptr, ...)
+
+Reconnect-loop example:
+
+    import time
+
+    while True:
+        try:
+            with cuframes.subscribe("cam-parking", connect_timeout_ms=5000) as sub:
+                while True:
+                    try:
+                        with sub.next_frame(timeout_ms=1000) as frame:
+                            process(frame)
+                    except cuframes.CuframesFrameTimeout:
+                        continue  # simply no new frames yet
+        except cuframes.CuframesPublisherGone:
+            time.sleep(1)  # publisher restarted — subscribe again
+"""
+
+from ._native import (
+    # Metadata
+    version_string,
+    protocol_version,
+    # Enums
+    PixelFormat,
+    SubscriberMode,
+    # Core API
+    CuframesSubscriber,
+    CuframesFrame,
+    subscribe,
+    # Error taxonomy
+    CuframesError,
+    CuframesPublisherGone,
+    CuframesFrameTimeout,
+    CuframesDeviceLost,
+    CuframesShmError,
+    CuframesProtocolMismatch,
+    CuframesInvalidArgument,
+    CuframesOutOfMemory,
+    CuframesInternal,
+)
+
+__version__ = version_string()
+
+__all__ = [
+    "version_string",
+    "protocol_version",
+    "PixelFormat",
+    "SubscriberMode",
+    "CuframesSubscriber",
+    "CuframesFrame",
+    "subscribe",
+    "CuframesError",
+    "CuframesPublisherGone",
+    "CuframesFrameTimeout",
+    "CuframesDeviceLost",
+    "CuframesShmError",
+    "CuframesProtocolMismatch",
+    "CuframesInvalidArgument",
+    "CuframesOutOfMemory",
+    "CuframesInternal",
+]
@@ -0,0 +1,47 @@
+[build-system]
+requires = [
+    "scikit-build-core>=0.10",
+    "pybind11>=2.13",
+]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "cuframes"
+version = "0.4.0"
+description = "Python bindings for cuframes — zero-copy CUDA frame sharing"
+readme = "README.md"
+license = { text = "LGPL-2.1+" }
+requires-python = ">=3.10"
+authors = [{ name = "Evgeny Demchenko", email = "demchenkoev@gmail.com" }]
+keywords = ["cuda", "video", "ipc", "zero-copy"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Multimedia :: Video",
+]
+
+[project.optional-dependencies]
+torch = ["torch>=2.4"]
+cupy = ["cupy-cuda12x>=13"]
+dev = ["pytest>=8", "ruff>=0.6"]
+
+[tool.scikit-build]
+cmake.version = ">=3.20"
+cmake.build-type = "Release"
+build-dir = "build/{wheel_tag}"
+wheel.packages = ["cuframes"]
+# Only the Python module is meant to be built here; libcuframes is built
+# separately in the main CMake project and linked as an imported target.
+cmake.args = ["-DBUILD_PYTHON_BINDINGS=ON", "-DBUILD_EXAMPLES=OFF", "-DBUILD_TOOLS=OFF"]
+cmake.source-dir = ".."
+
+[tool.scikit-build.cmake.define]
+BUILD_PYTHON_BINDINGS = "ON"
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
@@ -0,0 +1,757 @@
+// cuframes Python bindings — pybind11 entry point.
+//
+// This file implements the core wrappers for the subscriber-side API:
+//   - CuframesFrame      — owning handle for a single frame, context manager
+//   - CuframesSubscriber — owning handle for a subscription, context manager
+//
+// DLPack export (#199), the per-subscriber CUDA stream (#201) and the
+// health/stats properties (#200) are implemented in this same file.
+//
+// Thread-safety contract (preliminary; the final wording is task #202):
+//   - Each handle (CuframesSubscriber / CuframesFrame) belongs to a single
+//     Python thread. Cross-thread access is undefined behavior at the C level.
+//   - The GIL is released around long I/O calls (next_frame), so other Python
+//     threads can run while we wait for a frame.
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cuframes/cuframes.h"
+
+// DLPack — the standard protocol for exchanging tensor-like structures
+// between frameworks (PyTorch/CuPy/JAX/TF).
+// See https://dmlc.github.io/dlpack/latest/
+// The declarations are embedded inline to avoid an external dependency.
+// NOTE(review): only the enumerators used by this file are declared here;
+// confirm the struct layouts still match the upstream dlpack.h in use.
+namespace dlpack {
+
+// Device kinds referenced by this file.
+typedef enum {
+    kDLCPU = 1,
+    kDLCUDA = 2,
+} DLDeviceType;
+
+// Device descriptor: kind plus ordinal.
+typedef struct {
+    DLDeviceType device_type;
+    int32_t device_id;
+} DLDevice;
+
+// Element type categories.
+typedef enum {
+    kDLInt = 0U,
+    kDLUInt = 1U,
+    kDLFloat = 2U,
+} DLDataTypeCode;
+
+// Element type: category code, bit width and lane count.
+typedef struct {
+    uint8_t code;
+    uint8_t bits;
+    uint16_t lanes;
+} DLDataType;
+
+// Non-owning tensor view. `shape` and `strides` point at arrays owned by
+// whoever created the tensor (here: DLPackContext).
+typedef struct {
+    void* data;
+    DLDevice device;
+    int32_t ndim;
+    DLDataType dtype;
+    int64_t* shape;
+    int64_t* strides;
+    uint64_t byte_offset;
+} DLTensor;
+
+// Tensor view plus an opaque producer context and the callback that frees it.
+typedef struct DLManagedTensor {
+    DLTensor dl_tensor;
+    void* manager_ctx;
+    void (*deleter)(struct DLManagedTensor* self);
+} DLManagedTensor;
+
+}  // namespace dlpack
+
+namespace py = pybind11;
+
+namespace {
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Error taxonomy — Python exceptions corresponding to cuframes_error_t.
+//
+// Principle: every error category that needs different handling downstream
+// (reconnect vs retry vs fatal) gets its own exception class. This covers the
+// architect-review requirement that a detector must be able to run a
+// reconnect loop on publisher-gone instead of crashing.
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Holds the Python exception classes created during module initialisation.
+// The trailing comment on each field names the error codes mapped to it by
+// exception_for().
+struct CuframesExceptions {
+    py::object base;
+    py::object publisher_gone;     // CUFRAMES_ERR_DISCONNECTED, _NOT_FOUND
+    py::object frame_timeout;      // CUFRAMES_ERR_TIMEOUT, _WOULD_BLOCK
+    py::object device_lost;        // CUFRAMES_ERR_CUDA
+    py::object shm_error;          // CUFRAMES_ERR_IO
+    py::object protocol_mismatch;  // CUFRAMES_ERR_PROTOCOL
+    py::object invalid_argument;   // CUFRAMES_ERR_INVALID_ARG
+    py::object out_of_memory;      // CUFRAMES_ERR_OUT_OF_MEMORY
+    py::object internal;           // CUFRAMES_ERR_INTERNAL and anything else
+};
+
+// Module-wide registry, populated in PYBIND11_MODULE.
+// NOTE(review): these py::object members are destroyed during static
+// destruction, possibly after the interpreter has finalized — confirm this is
+// benign for the supported Python versions.
+CuframesExceptions g_exc;
+
+// Maps a cuframes_error_t value to the Python exception class that should be
+// raised for it. Any code without a dedicated class maps to CuframesInternal.
+py::object exception_for(int err) {
+    // Publisher is missing or went away — callers typically reconnect.
+    if (err == CUFRAMES_ERR_NOT_FOUND || err == CUFRAMES_ERR_DISCONNECTED) {
+        return g_exc.publisher_gone;
+    }
+    // No frame became available in time — callers typically retry.
+    if (err == CUFRAMES_ERR_TIMEOUT || err == CUFRAMES_ERR_WOULD_BLOCK) {
+        return g_exc.frame_timeout;
+    }
+    if (err == CUFRAMES_ERR_CUDA) {
+        return g_exc.device_lost;
+    }
+    if (err == CUFRAMES_ERR_IO) {
+        return g_exc.shm_error;
+    }
+    if (err == CUFRAMES_ERR_PROTOCOL) {
+        return g_exc.protocol_mismatch;
+    }
+    if (err == CUFRAMES_ERR_INVALID_ARG) {
+        return g_exc.invalid_argument;
+    }
+    if (err == CUFRAMES_ERR_OUT_OF_MEMORY) {
+        return g_exc.out_of_memory;
+    }
+    return g_exc.internal;
+}
+
+// Raises the matching Python exception when `err` is not CUFRAMES_OK.
+//
+// Parameters:
+//   err        — status code returned by a cuframes C API call.
+//   operation  — optional name of the failed call, prefixed to the message.
+//
+// Throws py::error_already_set with the class chosen by exception_for(err).
+// Must be called with the GIL held, because it sets the Python error state.
+void check(int err, const char* operation = nullptr) {
+    if (err == CUFRAMES_OK) return;
+
+    // Constructing std::string from a null pointer is undefined behavior, so
+    // fall back to a fixed text if the library has no description for `err`.
+    const char* msg = cuframes_strerror(err);
+    if (msg == nullptr) msg = "unknown error";
+
+    std::string what;
+    if (operation != nullptr) {
+        what += operation;
+        what += ": ";
+    }
+    what += msg;
+    what += " (code=" + std::to_string(err) + ")";
+
+    PyErr_SetString(exception_for(err).ptr(), what.c_str());
+    throw py::error_already_set();
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// SubscriberLink — liveness token shared by a subscriber and its frames.
+//
+// `sub` holds the native handle while the subscription is open and becomes
+// nullptr once SubscriberWrapper::close() has destroyed it. Frames hold the
+// token through a shared_ptr, so they can detect a closed subscription
+// instead of handing a dangling cuframes_subscriber_t* (or a dangling frame
+// pointer) to the C API.
+// ─────────────────────────────────────────────────────────────────────────────
+
+struct SubscriberLink {
+    cuframes_subscriber_t* sub = nullptr;
+};
+
+// ─────────────────────────────────────────────────────────────────────────────
+// CuframesFrame — owning wrapper around cuframes_frame_t*.
+//
+// Lifecycle:
+//   - constructed only by SubscriberWrapper::next_frame()
+//   - the destructor (or __exit__) releases the frame automatically
+//   - after release() every property accessor raises CuframesError
+//   - non-copyable
+//
+// A frame counts as released as soon as its subscriber has been closed:
+// release() then becomes a no-op on the C side and accessors raise instead of
+// dereferencing a frame pointer that belonged to the destroyed subscription.
+// NOTE(review): this relies on cuframes_subscriber_destroy() reclaiming all
+// outstanding frames, as the original design comment stated — confirm against
+// the C API.
+//
+// Phase 0: a frame and its subscriber must be used from one Python thread.
+// ─────────────────────────────────────────────────────────────────────────────
+
+class FrameWrapper {
+public:
+    // owner — liveness token of the subscriber that produced `frame`.
+    // frame — native frame handle returned by cuframes_subscriber_next().
+    FrameWrapper(std::shared_ptr<SubscriberLink> owner, cuframes_frame_t* frame)
+        : owner_(std::move(owner)), frame_(frame) {}
+
+    ~FrameWrapper() {
+        try { release(); } catch (...) { /* never throw from a destructor */ }
+    }
+
+    // Owning resource: copying would lead to a double release.
+    FrameWrapper(const FrameWrapper&) = delete;
+    FrameWrapper& operator=(const FrameWrapper&) = delete;
+
+    // True after release(), or once the owning subscriber has been closed.
+    bool released() const noexcept {
+        return frame_ == nullptr || native_sub() == nullptr;
+    }
+
+    // Hands the frame back to the publisher. Idempotent.
+    void release() {
+        if (frame_ == nullptr) return;
+        // If the subscriber is already closed there is nothing left to
+        // release against; just drop our references.
+        if (cuframes_subscriber_t* sub = native_sub()) {
+            cuframes_subscriber_release(sub, frame_);
+        }
+        frame_ = nullptr;
+        owner_.reset();
+    }
+
+    // Internal hook — detaches the frame from its subscriber so that
+    // release() no longer calls into the C API.
+    void invalidate_subscriber() noexcept { owner_.reset(); }
+
+    // ── Properties ──────────────────────────────────────────────────────
+    // Every getter checks liveness first and raises CuframesError otherwise.
+
+    void check_alive() const {
+        if (released()) {
+            PyErr_SetString(g_exc.base.ptr(), "frame has been released");
+            throw py::error_already_set();
+        }
+    }
+
+    uintptr_t cuda_ptr() const {
+        check_alive();
+        return reinterpret_cast<uintptr_t>(cuframes_frame_cuda_ptr(frame_));
+    }
+
+    cuframes_format_t format() const {
+        check_alive();
+        return cuframes_frame_format(frame_);
+    }
+
+    int width() const {
+        check_alive();
+        // Zero-initialised so a size query that writes nothing cannot leak
+        // indeterminate values to Python.
+        int32_t w = 0, h = 0;
+        cuframes_frame_size(frame_, &w, &h);
+        return w;
+    }
+
+    int height() const {
+        check_alive();
+        int32_t w = 0, h = 0;
+        cuframes_frame_size(frame_, &w, &h);
+        return h;
+    }
+
+    int pitch_y() const {
+        check_alive();
+        return cuframes_frame_pitch_y(frame_);
+    }
+
+    int pitch_uv() const {
+        check_alive();
+        return cuframes_frame_pitch_uv(frame_);
+    }
+
+    uint64_t seq() const {
+        check_alive();
+        return cuframes_frame_seq(frame_);
+    }
+
+    int64_t pts_ns() const {
+        check_alive();
+        return cuframes_frame_pts_ns(frame_);
+    }
+
+    // Raw handles for binding code. internal_sub() is nullptr once the frame
+    // is detached or the subscriber is closed; call check_alive() before
+    // using internal_frame().
+    cuframes_subscriber_t* internal_sub() const noexcept { return native_sub(); }
+    cuframes_frame_t* internal_frame() const noexcept { return frame_; }
+
+private:
+    // Native subscriber handle, or nullptr if detached / subscriber closed.
+    cuframes_subscriber_t* native_sub() const noexcept {
+        return owner_ ? owner_->sub : nullptr;
+    }
+
+    std::shared_ptr<SubscriberLink> owner_;
+    cuframes_frame_t* frame_;
+};
+
+// ─────────────────────────────────────────────────────────────────────────────
+// DLPack export helpers.
+//
+// An NV12 frame consists of two planes: Y (uint8, H×W, pitch=pitch_y) and
+// interleaved UV (uint8, H/2×W, pitch=pitch_uv; W here is the row width in
+// bytes of the interleaved U+V data).
+//
+// Strategy: the user gets a separate DLPack capsule per plane. The UV offset
+// is computed by the caller as pitch_y * height, using the visible height
+// reported by cuframes_frame_size.
+// NOTE(review): if the producer pads the Y plane to an aligned height, that
+// offset would be wrong — confirm against the publisher's layout.
+//
+// Lifetime: the deleter frees only the context (shape/strides arrays and the
+// keep-alive reference). The CUDA pointer itself belongs to the frame. The
+// context holds a py::object referencing the CuframesFrame so the Python
+// object outlives the tensor.
+// NOTE(review): the keep-alive does not stop the user from calling
+// frame.release() explicitly while a tensor still references the memory.
+// ─────────────────────────────────────────────────────────────────────────────
+
+struct DLPackContext {
+    py::object frame_keep_alive;  // Python-side CuframesFrame
+    std::vector<int64_t> shape;
+    std::vector<int64_t> strides;
+};
+
+// DLManagedTensor deleter: frees the context and the managed tensor itself.
+static void dlpack_deleter(dlpack::DLManagedTensor* self) {
+    if (!self) return;
+    auto* ctx = static_cast<DLPackContext*>(self->manager_ctx);
+    if (ctx) {
+        // Dropping the Python reference held by the context requires the GIL.
+        py::gil_scoped_acquire gil;
+        delete ctx;
+    }
+    delete self;
+}
+
+// PyCapsule destructor: frees the managed tensor only if nobody consumed it.
+static void dlpack_pycapsule_destructor(PyObject* capsule) {
+    if (PyCapsule_IsValid(capsule, "dltensor")) {
+        // The capsule was NOT consumed downstream (e.g. by torch.from_dlpack),
+        // so the managed tensor has to be freed here.
+        auto* mt = static_cast<dlpack::DLManagedTensor*>(
+            PyCapsule_GetPointer(capsule, "dltensor"));
+        if (mt && mt->deleter) {
+            mt->deleter(mt);
+        }
+    }
+    // If the capsule was renamed to "used_dltensor" the consumer took
+    // ownership and nothing is done here.
+}
+
+// Builds a "dltensor" capsule describing a 2D uint8 CUDA tensor.
+//
+// Parameters:
+//   data              — device pointer to the first element of the plane.
+//   rows, cols        — tensor shape.
+//   row_stride_bytes  — distance between rows; elements are one byte wide, so
+//                       the byte pitch is also the stride in elements.
+//   cuda_device       — device ordinal stored in the tensor descriptor.
+//   frame_keep_alive  — Python object kept referenced until the tensor's
+//                       deleter runs.
+static py::capsule make_dlpack_capsule(
+    void* data,
+    int rows, int cols, int64_t row_stride_bytes,
+    int cuda_device,
+    py::object frame_keep_alive
+) {
+    // Held in unique_ptr until the capsule owns them, so nothing leaks if an
+    // allocation or the capsule construction throws.
+    auto ctx = std::make_unique<DLPackContext>();
+    ctx->frame_keep_alive = std::move(frame_keep_alive);
+    ctx->shape = {static_cast<int64_t>(rows), static_cast<int64_t>(cols)};
+    ctx->strides = {row_stride_bytes, 1};
+
+    auto mt = std::make_unique<dlpack::DLManagedTensor>();
+    mt->dl_tensor.data = data;
+    mt->dl_tensor.device = {dlpack::kDLCUDA, cuda_device};
+    mt->dl_tensor.ndim = 2;
+    mt->dl_tensor.dtype = {dlpack::kDLUInt, 8, 1};  // uint8
+    mt->dl_tensor.shape = ctx->shape.data();
+    mt->dl_tensor.strides = ctx->strides.data();
+    mt->dl_tensor.byte_offset = 0;
+    mt->manager_ctx = ctx.get();
+    mt->deleter = dlpack_deleter;
+
+    py::capsule capsule(mt.get(), "dltensor", &dlpack_pycapsule_destructor);
+    // Ownership now belongs to the capsule / its eventual consumer.
+    ctx.release();
+    mt.release();
+    return capsule;
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// CuframesSubscriber — owning wrapper around cuframes_subscriber_t*.
+//
+// API:
+//   sub = cuframes.subscribe("cam-parking", consumer_name="yolo-world",
+//                            timeout_ms=5000)
+//   with sub:
+//       with sub.next_frame(timeout_ms=1000) as frame:
+//           do_something(frame.cuda_ptr, frame.width, frame.height)
+//   # sub.close() happens automatically here
+//
+// Iteration (Phase 0.5):
+//   for frame in sub.frames(timeout_ms=1000):
+//       ...
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Per-subscriber health stats. The Phase 0 version counts in the pybind layer
+// (the cuframes C API does not expose ring_occupancy / drop_count directly).
+// If the C API grows a stats call later, reads can come from there, but these
+// counters stay for compatibility with what consumers see through Python.
+struct SubscriberStats {
+    uint64_t frames_received = 0;  // successful next_frame() calls
+    uint64_t timeouts = 0;         // CUFRAMES_ERR_TIMEOUT / WOULD_BLOCK
+    uint64_t errors = 0;           // any other next_frame() failure
+    uint64_t last_seq = 0;         // seq of the most recently received frame
+    uint64_t gap_count = 0;        // number of times seq[i] > seq[i-1] + 1
+                                   // (proxy for drop count in NEWEST_ONLY mode)
+    int64_t  last_frame_pts_ns = 0;
+};
+
+class SubscriberWrapper {
+public:
+    // Opens a subscription. Blocks (with the GIL released) until the
+    // underlying cuframes_subscriber_create() returns; raises the mapped
+    // CuframesError subclass on failure.
+    //
+    // consumer_stream is an integer carrying an opaque cudaStream_t; 0 means
+    // the default stream.
+    SubscriberWrapper(
+        const std::string& key,
+        std::optional<std::string> consumer_name,
+        cuframes_subscriber_mode_t mode,
+        int cuda_device,
+        int connect_timeout_ms,
+        uintptr_t consumer_stream
+    ) : link_(std::make_shared<SubscriberLink>()),
+        key_(key),
+        consumer_name_(consumer_name.value_or("")),
+        mode_(mode),
+        cuda_device_(cuda_device),
+        consumer_stream_(reinterpret_cast<void*>(consumer_stream)) {
+
+        cuframes_subscriber_config_t cfg = {};
+        cfg.key = key_.c_str();
+        cfg.consumer_name = consumer_name.has_value() ? consumer_name_.c_str() : nullptr;
+        cfg.mode = mode_;
+        cfg.cuda_device = cuda_device_;
+        cfg.connect_timeout_ms = connect_timeout_ms;
+
+        // create may block while waiting for the publisher — release the GIL.
+        cuframes_subscriber_t* created = nullptr;
+        int err;
+        {
+            py::gil_scoped_release rel;
+            err = cuframes_subscriber_create(&cfg, &created);
+        }
+        check(err, "cuframes_subscriber_create");
+        link_->sub = created;
+    }
+
+    ~SubscriberWrapper() {
+        try { close(); } catch (...) { /* never throw from a destructor */ }
+    }
+
+    SubscriberWrapper(const SubscriberWrapper&) = delete;
+    SubscriberWrapper& operator=(const SubscriberWrapper&) = delete;
+
+    bool closed() const noexcept { return link_->sub == nullptr; }
+
+    // Destroys the subscription. Idempotent. Frames still held by Python see
+    // the cleared link and turn into released frames instead of touching the
+    // destroyed handle.
+    void close() {
+        cuframes_subscriber_t* sub = link_->sub;
+        if (sub != nullptr) {
+            // Clear the shared link first so no frame can observe a handle
+            // that is in the middle of being destroyed.
+            link_->sub = nullptr;
+            cuframes_subscriber_destroy(sub);
+        }
+    }
+
+    void check_alive() const {
+        if (closed()) {
+            PyErr_SetString(g_exc.base.ptr(), "subscriber has been closed");
+            throw py::error_already_set();
+        }
+    }
+
+    // Waits for the next frame and returns a new FrameWrapper owned by the
+    // caller (Python GC). The GIL is released for the blocking call so other
+    // threads keep running. Raises the mapped CuframesError subclass on
+    // failure, after updating the health counters.
+    std::unique_ptr<FrameWrapper> next_frame(int timeout_ms) {
+        check_alive();
+        cuframes_subscriber_t* const sub = link_->sub;
+        cuframes_frame_t* raw = nullptr;
+        int err;
+        {
+            py::gil_scoped_release rel;
+            // The per-subscriber stream configured at construction is passed
+            // through so consumers do not have to share the default stream.
+            err = cuframes_subscriber_next(sub, consumer_stream_,
+                                            &raw, timeout_ms);
+        }
+        // Update health stats before check(): otherwise a raised exception
+        // would leave the counters frozen and confuse the operator.
+        if (err == CUFRAMES_OK) {
+            // Use the received-frame count, not `last_seq != 0`, to decide
+            // whether a previous frame exists: a sequence that legitimately
+            // starts at 0 would otherwise hide the first gap.
+            const bool has_previous = stats_.frames_received > 0;
+            stats_.frames_received++;
+            uint64_t seq = cuframes_frame_seq(raw);
+            if (has_previous && seq > stats_.last_seq + 1) {
+                stats_.gap_count++;
+            }
+            stats_.last_seq = seq;
+            stats_.last_frame_pts_ns = cuframes_frame_pts_ns(raw);
+        } else if (err == CUFRAMES_ERR_TIMEOUT || err == CUFRAMES_ERR_WOULD_BLOCK) {
+            stats_.timeouts++;
+        } else {
+            stats_.errors++;
+        }
+        check(err, "cuframes_subscriber_next");
+        return std::make_unique<FrameWrapper>(link_, raw);
+    }
+
+    const std::string& key() const { return key_; }
+    const std::string& consumer_name() const { return consumer_name_; }
+    cuframes_subscriber_mode_t mode() const { return mode_; }
+    int cuda_device() const { return cuda_device_; }
+    const SubscriberStats& stats() const { return stats_; }
+
+    // Snapshot of the stats as a Python dict — for MQTT health publishing.
+    py::dict stats_dict() const {
+        py::dict d;
+        d["frames_received"]   = stats_.frames_received;
+        d["timeouts"]          = stats_.timeouts;
+        d["errors"]            = stats_.errors;
+        d["last_seq"]          = stats_.last_seq;
+        d["gap_count"]         = stats_.gap_count;
+        d["last_frame_pts_ns"] = stats_.last_frame_pts_ns;
+        return d;
+    }
+
+    uintptr_t consumer_stream() const {
+        return reinterpret_cast<uintptr_t>(consumer_stream_);
+    }
+
+private:
+    // Shared with every frame produced by this subscriber; never null.
+    std::shared_ptr<SubscriberLink> link_;
+    std::string key_;
+    std::string consumer_name_;
+    cuframes_subscriber_mode_t mode_;
+    int cuda_device_;
+    // CUDA stream — opaque cudaStream_t, passed in from Python as an int
+    // (obtained via cuda-python / torch.cuda.Stream._as_parameter_).
+    // nullptr = default stream (meant for smoke tests; production consumers
+    // should bring their own stream to avoid serializing through the default
+    // one).
+    void* consumer_stream_ = nullptr;
+    SubscriberStats stats_{};
+};
+
+}  // namespace
+
+// Module definition for `_native`: registers version helpers, the exception
+// hierarchy, the enum mirrors, the CuframesFrame / CuframesSubscriber classes
+// and the subscribe() factory.
+PYBIND11_MODULE(_native, m) {
+    m.doc() = "cuframes — zero-copy CUDA frame sharing (native bindings)";
+
+    // ── Version ─────────────────────────────────────────────────────────
+    m.def("version_string", []() {
+        return std::string(cuframes_version_string());
+    }, "Runtime version of libcuframes (MAJOR.MINOR.PATCH).");
+
+    m.def("protocol_version", []() {
+        return static_cast<uint32_t>(cuframes_protocol_version());
+    }, "Wire-protocol version. Subscribers с разной версией не подключатся.");
+
+    // NOTE(review): CUFRAMES_PY_BINDING_VERSION is not defined in this file;
+    // presumably it is injected by the build system — confirm.
+    m.attr("__binding_version__") = CUFRAMES_PY_BINDING_VERSION;
+
+    // ── Error taxonomy ──────────────────────────────────────────────────
+    // Hierarchy:
+    //   CuframesError (base)
+    //   ├── CuframesPublisherGone
+    //   ├── CuframesFrameTimeout
+    //   ├── CuframesDeviceLost
+    //   ├── CuframesShmError
+    //   ├── CuframesProtocolMismatch
+    //   ├── CuframesInvalidArgument
+    //   ├── CuframesOutOfMemory
+    //   └── CuframesInternal
+    //
+    // py::exception<T>(...) already yields a py::object for the Python class
+    // itself. Do not call .attr("__class__") on it — that would return the
+    // metaclass.
+
+    g_exc.base = py::exception<std::runtime_error>(m, "CuframesError");
+    // Creates a subclass of CuframesError with the given name.
+    auto make_subexc = [&m](const char* name) -> py::object {
+        return py::exception<std::runtime_error>(m, name, g_exc.base.ptr());
+    };
+    g_exc.publisher_gone     = make_subexc("CuframesPublisherGone");
+    g_exc.frame_timeout      = make_subexc("CuframesFrameTimeout");
+    g_exc.device_lost        = make_subexc("CuframesDeviceLost");
+    g_exc.shm_error          = make_subexc("CuframesShmError");
+    g_exc.protocol_mismatch  = make_subexc("CuframesProtocolMismatch");
+    g_exc.invalid_argument   = make_subexc("CuframesInvalidArgument");
+    g_exc.out_of_memory      = make_subexc("CuframesOutOfMemory");
+    g_exc.internal           = make_subexc("CuframesInternal");
+
+    // ── Pixel formats (enum mirror) ─────────────────────────────────────
+    py::enum_<cuframes_format_t>(m, "PixelFormat")
+        .value("NV12",      CUFRAMES_FORMAT_NV12)
+        .value("YUV420P",   CUFRAMES_FORMAT_YUV420P)
+        .value("RGB",       CUFRAMES_FORMAT_RGB)
+        .value("BGR",       CUFRAMES_FORMAT_BGR)
+        .value("RGBA",      CUFRAMES_FORMAT_RGBA)
+        .value("GRAYSCALE", CUFRAMES_FORMAT_GRAYSCALE);
+
+    py::enum_<cuframes_subscriber_mode_t>(m, "SubscriberMode")
+        .value("NEWEST_ONLY",  CUFRAMES_MODE_NEWEST_ONLY)
+        .value("STRICT_ORDER", CUFRAMES_MODE_STRICT_ORDER);
+
+    // ── CuframesFrame ───────────────────────────────────────────────────
+    py::class_<FrameWrapper>(m, "CuframesFrame",
+        "Один кадр от cuframes publisher'а.\n\n"
+        "Получается через CuframesSubscriber.next_frame().\n"
+        "Поддерживает context manager — release() при выходе из with-блока.\n"
+        "Все property accessor'ы после release() бросают CuframesError.\n\n"
+        "Это handle на frame в ring buffer publisher'а — данные остаются\n"
+        "в shared memory publisher'а пока frame не released. Долго удерживать\n"
+        "frame нельзя: medленный consumer заставит publisher либо overwrite\n"
+        "(DROP_OLDEST policy), либо stall (STRICT_WAIT).")
+        // properties (read-only)
+        .def_property_readonly("cuda_ptr", &FrameWrapper::cuda_ptr,
+            "CUDA device pointer на frame data (uintptr_t). Read-only для\n"
+            "consumer'а. Используйте через cuda-python / cupy / torch.from_blob.")
+        .def_property_readonly("format", &FrameWrapper::format,
+            "PixelFormat (NV12 для NVDEC publisher'а).")
+        .def_property_readonly("width", &FrameWrapper::width)
+        .def_property_readonly("height", &FrameWrapper::height)
+        .def_property_readonly("pitch_y", &FrameWrapper::pitch_y,
+            "Pitch (байт на строку) для Y plane. ВАЖНО: для больших\n"
+            "разрешений (2688×1520, gate_lpr) pitch != width — kernel'ы\n"
+            "должны принимать pitch как параметр.")
+        .def_property_readonly("pitch_uv", &FrameWrapper::pitch_uv,
+            "Pitch для UV plane (NV12/YUV420P); 0 для форматов без UV.")
+        .def_property_readonly("seq", &FrameWrapper::seq,
+            "Sequence number — монотонная нумерация у publisher'а.")
+        .def_property_readonly("pts_ns", &FrameWrapper::pts_ns,
+            "Presentation timestamp от publisher'а (наносекунды, CLOCK_MONOTONIC).")
+        .def_property_readonly("released", &FrameWrapper::released)
+        .def("release", &FrameWrapper::release,
+            "Освободить frame обратно publisher'у (ACK).\n"
+            "После release() property accessor'ы бросают CuframesError.\n"
+            "Idempotent — повторный вызов no-op.")
+        // context manager: __enter__ returns the frame itself, __exit__
+        // releases it and returns None so exceptions propagate.
+        .def("__enter__", [](FrameWrapper& self) -> FrameWrapper& {
+            self.check_alive();
+            return self;
+        }, py::return_value_policy::reference_internal)
+        .def("__exit__", [](FrameWrapper& self, py::object, py::object, py::object) {
+            self.release();
+            return py::none();
+        })
+        .def("__repr__", [](const FrameWrapper& f) {
+            if (f.released()) return std::string("<CuframesFrame released>");
+            return std::string("<CuframesFrame seq=") + std::to_string(f.seq()) +
+                   " size=" + std::to_string(f.width()) + "x" + std::to_string(f.height()) + ">";
+        })
+        // ── DLPack export ───────────────────────────────────────────────
+        // Multi-plane formats (NV12, YUV420P): planes are exported separately
+        // as 2D uint8 tensors; the consumer combines them itself.
+        // NOTE(review): all three exporters below use shape [H, W] with
+        // width in pixels; for packed multi-byte formats (RGB/BGR/RGBA) that
+        // would cover only part of each row — confirm intended behavior.
+        .def("dlpack_y",
+            [](py::object self) -> py::capsule {
+                auto& f = self.cast<FrameWrapper&>();
+                f.check_alive();
+                void* ptr = cuframes_frame_cuda_ptr(f.internal_frame());
+                int32_t w, h;
+                cuframes_frame_size(f.internal_frame(), &w, &h);
+                int pitch = cuframes_frame_pitch_y(f.internal_frame());
+                // For NV12/YUV420P width is in pixels and the Y plane uses W
+                // bytes per row. The pitch (physical row size in memory) may
+                // be larger than W, so it is passed as the row stride.
+                // The frame API offers no way to query the CUDA device, so
+                // device 0 is hard-coded here.
+                // NOTE(review): wrong for subscribers created with
+                // cuda_device != 0.
+                return make_dlpack_capsule(ptr, h, w, pitch, /*cuda_device=*/0, self);
+            },
+            "DLPack export Y-plane как 2D uint8 GPU tensor (shape=[H, W], stride=[pitch_y, 1]).\n"
+            "Работает для NV12, YUV420P, GRAYSCALE. Для других форматов — отдаёт первый plane.")
+        .def("dlpack_uv",
+            [](py::object self) -> py::capsule {
+                auto& f = self.cast<FrameWrapper&>();
+                f.check_alive();
+                auto fmt = cuframes_frame_format(f.internal_frame());
+                if (fmt != CUFRAMES_FORMAT_NV12) {
+                    PyErr_SetString(g_exc.invalid_argument.ptr(),
+                        "dlpack_uv() only supported for NV12 format");
+                    throw py::error_already_set();
+                }
+                void* base = cuframes_frame_cuda_ptr(f.internal_frame());
+                int32_t w, h;
+                cuframes_frame_size(f.internal_frame(), &w, &h);
+                int pitch_y = cuframes_frame_pitch_y(f.internal_frame());
+                int pitch_uv = cuframes_frame_pitch_uv(f.internal_frame());
+                // NV12 layout assumed here: the Y plane occupies pitch_y * h
+                // bytes and the interleaved UV plane follows immediately.
+                void* uv_ptr = static_cast<uint8_t*>(base) + (size_t)pitch_y * h;
+                // UV plane dimensions: H/2 rows, W columns of interleaved
+                // U+V bytes.
+                return make_dlpack_capsule(uv_ptr, h / 2, w, pitch_uv, /*cuda_device=*/0, self);
+            },
+            "DLPack export UV-plane (interleaved) для NV12.\n"
+            "Shape=[H/2, W] uint8, stride=[pitch_uv, 1]. U и V interleaved\n"
+            "по байтам в последнем измерении (W = ширина в пикселях, но\n"
+            "каждый pixel = 2 байта U+V).")
+        .def("__dlpack__",
+            [](py::object self, py::object /*stream*/) -> py::capsule {
+                // DLPack protocol entry point — single-plane access.
+                // For NV12/YUV420P this returns the Y plane (the most common
+                // use case: motion detection / brightness only need Y).
+                // The UV plane is available explicitly via .dlpack_uv().
+                // The `stream` argument is accepted but ignored.
+                auto& f = self.cast<FrameWrapper&>();
+                f.check_alive();
+                void* ptr = cuframes_frame_cuda_ptr(f.internal_frame());
+                int32_t w, h;
+                cuframes_frame_size(f.internal_frame(), &w, &h);
+                int pitch = cuframes_frame_pitch_y(f.internal_frame());
+                return make_dlpack_capsule(ptr, h, w, pitch, /*cuda_device=*/0, self);
+            },
+            py::arg("stream") = py::none(),
+            "DLPack protocol для torch.from_dlpack / cupy.from_dlpack.\n"
+            "Для NV12 возвращает Y plane. Для других planes — .dlpack_uv().")
+        .def("__dlpack_device__",
+            [](const FrameWrapper& f) -> py::tuple {
+                f.check_alive();
+                // (device_type, device_id): kDLCUDA=2, device hard-coded to 0.
+                return py::make_tuple(2, 0);
+            },
+            "DLPack device protocol — возвращает (kDLCUDA=2, device_id).");
+
+    // ── CuframesSubscriber ──────────────────────────────────────────────
+    py::class_<SubscriberWrapper>(m, "CuframesSubscriber",
+        "Subscription на cuframes publisher.\n\n"
+        "Создаётся через cuframes.subscribe(key, ...). Поддерживает context\n"
+        "manager — close() при выходе из with-блока.\n\n"
+        "Thread-safety contract:\n"
+        "  • Handle принадлежит одному Python потоку — создание и\n"
+        "    все вызовы (next_frame, close) должны быть в одном thread.\n"
+        "  • Несколько subscriber'ов в разных потоках — OK (каждому свой\n"
+        "    handle, свой CUDA stream).\n"
+        "  • Доступ к Frame после release() из другого потока — UB\n"
+        "    (cuframes_frame_t* указывает в ring buffer publisher'а, после\n"
+        "    release он может быть переписан).\n"
+        "  • Внутренний GIL отпускается на длинных I/O вызовах\n"
+        "    (subscriber_create, next_frame) — другие Python потоки могут\n"
+        "    выполняться параллельно пока мы ждём frame.\n\n"
+        "CUDA stream:\n"
+        "  consumer_stream передаётся как int (cudaStream_t как opaque\n"
+        "  pointer). Получается через cuda-python (cudart.cudaStreamCreate)\n"
+        "  или torch (torch.cuda.Stream()._as_parameter_). Если 0 —\n"
+        "  default stream (serialization risk при нескольких subscriber'ах\n"
+        "  в одном процессе).")
+        .def(py::init<const std::string&, std::optional<std::string>,
+                       cuframes_subscriber_mode_t, int, int, uintptr_t>(),
+             py::arg("key"),
+             py::arg("consumer_name") = py::none(),
+             py::arg("mode") = CUFRAMES_MODE_NEWEST_ONLY,
+             py::arg("cuda_device") = 0,
+             py::arg("connect_timeout_ms") = -1,
+             py::arg("consumer_stream") = 0,
+             "Создать subscription. Блокирует до publisher_ready или\n"
+             "connect_timeout_ms. -1 = ждать вечно, 0 = fail сразу.\n"
+             "consumer_stream: int representation cudaStream_t (0=default).")
+        .def_property_readonly("key", &SubscriberWrapper::key)
+        .def_property_readonly("consumer_name", &SubscriberWrapper::consumer_name)
+        .def_property_readonly("mode", &SubscriberWrapper::mode)
+        .def_property_readonly("cuda_device", &SubscriberWrapper::cuda_device)
+        .def_property_readonly("consumer_stream", &SubscriberWrapper::consumer_stream,
+            "Pointer на cudaStream_t (int). 0 = default stream.")
+        .def_property_readonly("closed", &SubscriberWrapper::closed)
+        .def("next_frame", &SubscriberWrapper::next_frame,
+             py::arg("timeout_ms") = -1,
+             "Получить следующий frame.\n\n"
+             "timeout_ms: -1 = ждать вечно; 0 = non-blocking\n"
+             "(CuframesFrameTimeout если нет данных); >0 = с таймаутом.\n\n"
+             "Возвращает CuframesFrame — context manager. Использовать через\n"
+             "`with sub.next_frame() as frame: ...` для гарантии release.")
+        .def("close", &SubscriberWrapper::close,
+             "Закрыть subscription. Idempotent.")
+        // ── Health / stats ──────────────────────────────────────────────
+        // Phase 0: counted in the pybind layer (the cuframes C API does not
+        // expose ring_occupancy / drop_count directly). These counters are
+        // meant to be sufficient for an MQTT health publisher / monitoring.
+        .def_property_readonly("frames_received",
+            [](const SubscriberWrapper& s) { return s.stats().frames_received; },
+            "Количество успешных next_frame() с момента subscribe.")
+        .def_property_readonly("timeouts",
+            [](const SubscriberWrapper& s) { return s.stats().timeouts; },
+            "Сколько раз next_frame() вернул CuframesFrameTimeout.")
+        .def_property_readonly("errors",
+            [](const SubscriberWrapper& s) { return s.stats().errors; },
+            "Сколько раз next_frame() упал с error (не timeout).")
+        .def_property_readonly("last_seq",
+            [](const SubscriberWrapper& s) { return s.stats().last_seq; },
+            "Sequence number последнего полученного frame'а.")
+        .def_property_readonly("gap_count",
+            [](const SubscriberWrapper& s) { return s.stats().gap_count; },
+            "Сколько раз seq[i] > seq[i-1] + 1 — proxy для drop count\n"
+            "в NEWEST_ONLY mode. В STRICT_ORDER должен оставаться 0.")
+        .def_property_readonly("last_frame_pts_ns",
+            [](const SubscriberWrapper& s) { return s.stats().last_frame_pts_ns; })
+        .def("stats",
+            [](const SubscriberWrapper& s) { return s.stats_dict(); },
+            "Snapshot всех health counters как dict — для MQTT health publish.")
+        // context manager: __enter__ returns the subscriber itself, __exit__
+        // closes it and returns None so exceptions propagate.
+        .def("__enter__", [](SubscriberWrapper& self) -> SubscriberWrapper& {
+            self.check_alive();
+            return self;
+        }, py::return_value_policy::reference_internal)
+        .def("__exit__", [](SubscriberWrapper& self, py::object, py::object, py::object) {
+            self.close();
+            return py::none();
+        })
+        .def("__repr__", [](const SubscriberWrapper& s) {
+            return std::string("<CuframesSubscriber key='") + s.key() +
+                   "' closed=" + (s.closed() ? "True" : "False") + ">";
+        });
+
+    // ── Module-level factory ────────────────────────────────────────────
+    // Convenience shortcut: cuframes.subscribe("cam-parking") instead of
+    // cuframes._native.CuframesSubscriber(...). Takes the same arguments and
+    // defaults as the class constructor.
+    m.def("subscribe",
+        [](const std::string& key,
+           std::optional<std::string> consumer_name,
+           cuframes_subscriber_mode_t mode,
+           int cuda_device,
+           int connect_timeout_ms,
+           uintptr_t consumer_stream) {
+            return std::make_unique<SubscriberWrapper>(
+                key, consumer_name, mode, cuda_device,
+                connect_timeout_ms, consumer_stream);
+        },
+        py::arg("key"),
+        py::arg("consumer_name") = py::none(),
+        py::arg("mode") = CUFRAMES_MODE_NEWEST_ONLY,
+        py::arg("cuda_device") = 0,
+        py::arg("connect_timeout_ms") = -1,
+        py::arg("consumer_stream") = 0,
+        "Создать CuframesSubscriber. Shortcut для CuframesSubscriber(...).");
+}
@@ -0,0 +1,112 @@
+"""Smoke tests for the cuframes Python bindings.
+
+Phase 0 checks that:
+  - the module imports
+  - the version is readable
+  - the error classes exist and form a proper hierarchy
+  - subscribe() is exposed, documents its keyword arguments, and raises a
+    CuframesError when no publisher exists
+
+Tests that need a live publisher (frame delivery, DLPack) are not part of
+this file (see issue gx/cuframes#6, tasks #198+).
+"""
+
+import cuframes
+
+
+def test_version_format():
+    v = cuframes.version_string()
+    assert isinstance(v, str)
+    parts = v.split(".")
+    assert len(parts) >= 3
+    assert all(p.isdigit() for p in parts[:3])
+
+
+def test_protocol_version_is_uint():
+    pv = cuframes.protocol_version()
+    assert isinstance(pv, int)
+    assert pv >= 0
+
+
+def test_pixel_format_enum_members():
+    assert cuframes.PixelFormat.NV12.value == 0
+    assert cuframes.PixelFormat.YUV420P.value == 1
+
+
+def test_subscriber_mode_enum_members():
+    assert cuframes.SubscriberMode.NEWEST_ONLY.value == 0
+    assert cuframes.SubscriberMode.STRICT_ORDER.value == 1
+
+
+def test_error_hierarchy():
+    """Все subtype'ы наследуются от CuframesError."""
+    for sub in [
+        cuframes.CuframesPublisherGone,
+        cuframes.CuframesFrameTimeout,
+        cuframes.CuframesDeviceLost,
+        cuframes.CuframesShmError,
+        cuframes.CuframesProtocolMismatch,
+        cuframes.CuframesInvalidArgument,
+        cuframes.CuframesOutOfMemory,
+        cuframes.CuframesInternal,
+    ]:
+        assert issubclass(sub, cuframes.CuframesError)
+
+
+def test_subscriber_class_exposed():
+    """CuframesSubscriber/CuframesFrame exposed как public classes."""
+    assert hasattr(cuframes, "CuframesSubscriber")
+    assert hasattr(cuframes, "CuframesFrame")
+    assert hasattr(cuframes, "subscribe")
+
+
+def test_subscribe_to_missing_publisher_raises():
+    """Subscribe к несуществующему publisher → CuframesError (subclass)
+    после connect_timeout_ms.
+
+    Этот тест работает на любом хосте (без живого cuframes-pub) — мы
+    верифицируем что error path работает и маппит CUFRAMES_ERR_*
+    в правильный Python exception.
+    """
+    import pytest
+    with pytest.raises(cuframes.CuframesError):
+        cuframes.subscribe(
+            "definitely-not-existing-publisher-xyz",
+            connect_timeout_ms=100,
+        )
+
+
+def test_subscriber_repr_when_unable_to_connect():
+    """Лёгкий тест что repr не падает и close idempotent."""
+    import pytest
+    try:
+        sub = cuframes.subscribe("nope-xyz", connect_timeout_ms=100)
+    except cuframes.CuframesError:
+        return  # ожидаемо
+    pytest.fail("subscribe должно было выкинуть exception")
+
+
+def test_subscribe_accepts_consumer_stream_param():
+    """consumer_stream — uintptr (cudaStream_t).
+
+    Проверяем что параметр accepted; реальное использование требует
+    cuda-python / torch.cuda.Stream — это в integration тестах
+    yolo-world-detector'а.
+    """
+    import pytest
+    with pytest.raises(cuframes.CuframesError):
+        cuframes.subscribe(
+            "nope-xyz",
+            connect_timeout_ms=100,
+            consumer_stream=0,  # 0 = default stream
+        )
+
+
+def test_subscribe_kwargs_signature():
+    """Проверяем что у subscribe правильный набор kwargs."""
+    import inspect
+    # Pybind11-обёртки не дают inspect.signature, но help_doc отражает их.
+    doc = cuframes.subscribe.__doc__
+    assert "consumer_name" in doc
+    assert "mode" in doc
+    assert "cuda_device" in doc
+    assert "connect_timeout_ms" in doc
+    assert "consumer_stream" in doc
@@ -0,0 +1,4 @@
+vmm_fd_pingpong/producer
+vmm_fd_pingpong/consumer
+smoke_v04/smoke_pub
+smoke_v04/smoke_sub
@@ -0,0 +1,13 @@
+# Builds the two v0.4 smoke-test binaries against the in-tree headers and
+# the libcuframes found under ../../build-v04.
+# CC can be overridden on the command line (make CC=clang).
+CC      = gcc
+CFLAGS  = -O2 -Wall -I../../include -I/usr/local/cuda/include
+LDFLAGS = -L../../build-v04/libcuframes -lcuframes -L/usr/local/cuda/lib64 -lcudart -lcuda -lpthread -lrt
+
+all: smoke_pub smoke_sub
+
+smoke_pub: smoke_pub.c
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+smoke_sub: smoke_sub.c
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+clean:
+	rm -f smoke_pub smoke_sub
+
+# all/clean are not files; keep them working even if such files appear.
+.PHONY: all clean
@@ -0,0 +1,55 @@
+/* v0.4 smoke test publisher — NV12 1920x1080, ring of 4: fill each slot
+ * with the byte pattern (i % 256), publish it, and loop forever. */
+#include <cuframes/cuframes.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+/* Publish an endless stream of pattern-filled NV12 frames under the key given
+ * in argv[1] (default "smoke"), pausing 40 ms between frames.
+ * The loop only ends when a cuframes or CUDA call fails, so the process
+ * returns 1 in that case (and 1 if setup fails). */
+int main(int argc, char **argv) {
+    const char *key = argc > 1 ? argv[1] : "smoke";
+
+    cuframes_publisher_config_t cfg = {0};
+    cfg.key = key;
+    cfg.width = 1920;
+    cfg.height = 1080;
+    cfg.format = CUFRAMES_FORMAT_NV12;
+    cfg.ownership = CUFRAMES_OWNERSHIP_LIBRARY;
+    cfg.ring_size = 4;
+    cfg.policy = CUFRAMES_POLICY_DROP_OLDEST;
+    cfg.cuda_device = 0;
+
+    cuframes_publisher_t *pub = NULL;
+    int r = cuframes_publisher_create(&cfg, &pub);
+    if (r != CUFRAMES_OK) {
+        fprintf(stderr, "publisher create failed: %d (%s)\n", r, cuframes_strerror(r));
+        return 1;
+    }
+    fprintf(stderr, "publisher 'cuframes-%s' ready (v0.4 VMM)\n", key);
+
+    cudaStream_t stream;
+    cudaError_t cerr = cudaStreamCreate(&stream);
+    if (cerr != cudaSuccess) {
+        fprintf(stderr, "cudaStreamCreate: %s\n", cudaGetErrorString(cerr));
+        cuframes_publisher_destroy(pub);
+        return 1;
+    }
+
+    /* Bytes filled per frame: width * height * 3 / 2, taken from cfg so the
+     * fill size cannot drift from the configured dimensions. */
+    const size_t fill_bytes = (size_t)cfg.width * (size_t)cfg.height * 3 / 2;
+
+    int i = 0;
+    while (1) {
+        void *ptr = NULL;
+        r = cuframes_publisher_acquire(pub, &ptr);
+        if (r != CUFRAMES_OK) { fprintf(stderr, "acquire: %d\n", r); break; }
+
+        uint8_t pattern = (uint8_t)(i & 0xFF);
+        cerr = cudaMemsetAsync(ptr, pattern, fill_bytes, stream);
+        if (cerr != cudaSuccess) {
+            fprintf(stderr, "cudaMemsetAsync: %s\n", cudaGetErrorString(cerr));
+            break;
+        }
+
+        r = cuframes_publisher_publish(pub, stream,
+                                        (int64_t)cuframes_now_ns());
+        if (r != CUFRAMES_OK) { fprintf(stderr, "publish: %d\n", r); break; }
+        i++;
+        if (i % 50 == 0) fprintf(stderr, "published %d frames\n", i);
+        struct timespec ts = {.tv_sec = 0, .tv_nsec = 40000000};  /* 25 fps */
+        nanosleep(&ts, NULL);
+    }
+
+    cudaStreamDestroy(stream);
+    cuframes_publisher_destroy(pub);
+    /* Reaching this point means the loop was left through an error path. */
+    return 1;
+}
@@ -0,0 +1,63 @@
+/* v0.4 smoke subscriber — connect, read 100 frames, verify pattern, exit 0/1. */
+#include <cuframes/cuframes.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Smoke subscriber: attach to the publisher named by argv[1] (default "smoke"),
+ * read FRAMES_WANTED frames and check that the first 1 KiB of each frame is a
+ * single repeated byte.
+ * Returns 0 only if all FRAMES_WANTED frames were read and every one passed;
+ * any setup, receive or copy failure yields 1. */
+int main(int argc, char **argv) {
+    enum { FRAMES_WANTED = 100 };
+    const char *key = argc > 1 ? argv[1] : "smoke";
+
+    cuframes_subscriber_config_t cfg = {0};
+    cfg.key = key;
+    cfg.consumer_name = "smoke-sub";
+    cfg.mode = CUFRAMES_MODE_NEWEST_ONLY;
+    cfg.cuda_device = 0;
+    cfg.connect_timeout_ms = 10000;
+
+    cuframes_subscriber_t *sub = NULL;
+    int r = cuframes_subscriber_create(&cfg, &sub);
+    if (r != CUFRAMES_OK) {
+        fprintf(stderr, "subscriber create failed: %d (%s)\n", r, cuframes_strerror(r));
+        return 1;
+    }
+    fprintf(stderr, "subscribed to '%s' (v0.4)\n", key);
+
+    cudaStream_t stream;
+    cudaError_t cerr = cudaStreamCreate(&stream);
+    if (cerr != cudaSuccess) {
+        fprintf(stderr, "cudaStreamCreate: %s\n", cudaGetErrorString(cerr));
+        cuframes_subscriber_destroy(sub);
+        return 1;
+    }
+
+    size_t check_size = 1024;  /* sample only 1 KiB per frame to keep the test fast */
+    uint8_t *host = malloc(check_size);
+    if (!host) {
+        perror("malloc");
+        cudaStreamDestroy(stream);
+        cuframes_subscriber_destroy(sub);
+        return 1;
+    }
+
+    int frames = 0;
+    int good = 0;
+    while (frames < FRAMES_WANTED) {
+        cuframes_frame_t *f = NULL;
+        r = cuframes_subscriber_next(sub, stream, &f, 2000);
+        if (r != CUFRAMES_OK) {
+            fprintf(stderr, "next failed: %d (%s)\n", r, cuframes_strerror(r));
+            break;
+        }
+        /* The host buffer is only meaningful if both the copy and the sync
+         * succeeded; otherwise the comparison would read stale bytes. */
+        cerr = cudaMemcpyAsync(host, cuframes_frame_cuda_ptr(f), check_size,
+                               cudaMemcpyDeviceToHost, stream);
+        if (cerr == cudaSuccess) cerr = cudaStreamSynchronize(stream);
+        if (cerr != cudaSuccess) {
+            fprintf(stderr, "frame copy failed: %s\n", cudaGetErrorString(cerr));
+            cuframes_subscriber_release(sub, f);
+            break;
+        }
+        uint8_t exp = host[0];
+        int mismatch = 0;
+        for (size_t i = 1; i < check_size; i++) {
+            if (host[i] != exp) { mismatch++; }
+        }
+        if (mismatch == 0) good++;
+        if (frames % 20 == 0) {
+            fprintf(stderr, "frame seq=%lu byte0=0x%02x mismatch=%d\n",
+                    (unsigned long)cuframes_frame_seq(f), exp, mismatch);
+        }
+        cuframes_subscriber_release(sub, f);
+        frames++;
+    }
+    free(host);
+    cudaStreamDestroy(stream);
+    cuframes_subscriber_destroy(sub);
+
+    fprintf(stderr, "DONE: %d/%d frames OK\n", good, frames);
+    /* A run cut short by an error must not count as success. */
+    return (frames == FRAMES_WANTED && good == frames) ? 0 : 1;
+}
@@ -0,0 +1,16 @@
+# Builds the two spike binaries (producer, consumer). Both link only
+# against -lcuda; each is rebuilt when its source or common.h changes.
+CC      = gcc
+CFLAGS  = -O2 -Wall -I/usr/local/cuda/include
+LDFLAGS = -L/usr/local/cuda/lib64 -lcuda
+
+all: producer consumer
+
+producer: producer.c common.h
+	$(CC) $(CFLAGS) -o $@ producer.c $(LDFLAGS)
+
+consumer: consumer.c common.h
+	$(CC) $(CFLAGS) -o $@ consumer.c $(LDFLAGS)
+
+# Remove the built binaries.
+clean:
+	rm -f producer consumer
+
+.PHONY: all clean
@@ -0,0 +1,69 @@
+# vmm_fd_pingpong — spike для cuframes v0.4
+
+Проверка: можно ли заменить CUDA IPC mem handles на VMM (cuMemCreate)
+ POSIX FD export, чтобы убрать требование shared pid/ipc namespaces
+между producer и consumer контейнерами.
+
+## Результат: ✅ работает
+
+Запуск 2 контейнеров без shared pid/ipc, только volume mount для
+unix-сокета:
+
+```
+producer: granularity=2097152
+producer: alloc size=16777216
+producer: exported fd=37 for handle
+producer: listening on /run/spike/pingpong.sock, awaiting consumer...
+
+consumer: connected to producer
+consumer: recv fd=38 size=16777216 magic=0xa7
+consumer: imported handle OK
+consumer: mapped + access OK
+consumer: verify mismatch=0/1048576  → ACK=O
+consumer: done (OK)
+```
+
+## Ключевые наблюдения
+
+- **Granularity на 5090 = 2 MB**. 1920×1080 NV12 (~3.1 MB) округлится до 4 MB.
+  16 slots × 4 камеры × +1 MB = +64 MB VRAM поверх текущих cuda IPC аллокаций.
+- **FD передаётся через `sendmsg(SCM_RIGHTS)`** — kernel прокидывает реальный FD
+  в receiver namespace, переименовывая в свободный номер. Volume mount unix
+  socket'а — единственное требование (`/run/cuframes` уже монтируется как shared).
+- **`cuMemImportFromShareableHandle`** принимает FD как `(void *)(uintptr_t)fd`.
+- **Доступ на consumer side требует `cuMemSetAccess` с правильным `CUmemLocation`** —
+  device id из своего `cuDeviceGet`, не наследуется от producer.
+
+## Замена events (упрощение этапа C)
+
+CUDA events для IPC не имеют POSIX FD path. Внедрять external semaphores
+(OPAQUE_FD) — отдельный API, другая signal/wait семантика. **Вместо этого:**
+producer вызывает `cuStreamSynchronize(stream)` ПЕРЕД `atomic_store(seq)` в
+`do_publish`. Consumer тогда просто читает seq и копирует DtoD — без event wait.
+
+Overhead: ~1 ms на publish × 25 fps = 2.5% CPU time producer'а. Memory
+coherence гарантирована (один GPU, hardware ensures writes visible после
+stream sync).
+
+## Сборка
+
+```bash
+docker run --rm -v $PWD:/work -w /work nvidia/cuda:12.4.1-devel-ubuntu22.04 \
+    bash -c "apt-get install -y build-essential && make"
+```
+
+## Запуск теста
+
+```bash
+sudo mkdir -p /var/run/spike-pingpong && sudo chmod 777 /var/run/spike-pingpong
+
+docker run -d --name spike-prod --runtime=nvidia --gpus all \
+    -v $PWD:/work -v /var/run/spike-pingpong:/run/spike \
+    nvidia/cuda:12.4.1-base-ubuntu22.04 /work/producer
+
+docker run --rm --name spike-cons --runtime=nvidia --gpus all \
+    -v $PWD:/work -v /var/run/spike-pingpong:/run/spike \
+    nvidia/cuda:12.4.1-base-ubuntu22.04 /work/consumer
+
+docker logs spike-prod && docker rm -f spike-prod
+```
@@ -0,0 +1,20 @@
+#pragma once
+#include <cuda.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define POOL_SIZE (16 * 1024 * 1024)
+#define MAGIC_BYTE 0xA7
+#define SOCK_PATH "/run/spike/pingpong.sock"
+
+/* CHECK(expr): evaluate a CUDA driver API call exactly once. On any result
+ * other than CUDA_SUCCESS, print file:line, the call text, the numeric code
+ * and the driver's error string ("?" when none is returned) to stderr, then
+ * exit(1). */
+#define CHECK(expr) do {                                                       \
+    const CUresult cu_rc_ = (expr);                                            \
+    if (cu_rc_ != CUDA_SUCCESS) {                                              \
+        const char *cu_text_ = NULL;                                           \
+        cuGetErrorString(cu_rc_, &cu_text_);                                   \
+        if (cu_text_ == NULL) cu_text_ = "?";                                  \
+        fprintf(stderr, "%s:%d %s -> %d (%s)\n",                               \
+                __FILE__, __LINE__, #expr, (int)cu_rc_, cu_text_);             \
+        exit(1);                                                               \
+    }                                                                          \
+} while (0)
@@ -0,0 +1,97 @@
+#include "common.h"
+#include <errno.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+/* Receive one message carrying a (uint64_t size, uint8_t magic) payload plus a
+ * single file descriptor passed as SCM_RIGHTS ancillary data.
+ * On success returns 0 and fills *out_fd, *out_size and *out_magic.
+ * Returns -1 on a recvmsg error, a short or empty payload, truncated
+ * ancillary data, or a missing/malformed SCM_RIGHTS header. */
+static int recv_fd(int sock, int *out_fd, uint64_t *out_size, uint8_t *out_magic) {
+    /* The union gives the control buffer the alignment struct cmsghdr needs. */
+    union {
+        struct cmsghdr hdr;
+        char buf[CMSG_SPACE(sizeof(int))];
+    } ctrl;
+    memset(&ctrl, 0, sizeof(ctrl));
+
+    struct iovec iov[2];
+    iov[0].iov_base = out_size; iov[0].iov_len = sizeof(*out_size);
+    iov[1].iov_base = out_magic; iov[1].iov_len = sizeof(*out_magic);
+
+    struct msghdr msg = {0};
+    msg.msg_iov = iov; msg.msg_iovlen = 2;
+    msg.msg_control = ctrl.buf; msg.msg_controllen = sizeof(ctrl.buf);
+
+    const ssize_t want = (ssize_t)(sizeof(*out_size) + sizeof(*out_magic));
+    ssize_t n = recvmsg(sock, &msg, 0);
+    if (n < 0) { perror("recvmsg"); return -1; }
+    if (n != want) {
+        /* EOF or a partial read would leave size/magic partly unwritten. */
+        fprintf(stderr, "short payload: got %zd of %zd bytes\n", n, want);
+        return -1;
+    }
+    if (msg.msg_flags & MSG_CTRUNC) {
+        fprintf(stderr, "ancillary data truncated\n");
+        return -1;
+    }
+    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+    if (!cmsg || cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS ||
+        cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+        fprintf(stderr, "no SCM_RIGHTS in msg\n");
+        return -1;
+    }
+    memcpy(out_fd, CMSG_DATA(cmsg), sizeof(int));
+    return 0;
+}
+
+/* Spike consumer: connect to the producer's Unix socket, receive an exported
+ * allocation fd together with (size, magic), import and map it read-only,
+ * compare the first up-to-1 MiB against magic and send back 'O' (all bytes
+ * match) or 'X'.
+ * Returns 0 only when the pattern matched and the ACK byte was delivered. */
+int main(void) {
+    CHECK(cuInit(0));
+    CUdevice dev;
+    CHECK(cuDeviceGet(&dev, 0));
+    CUcontext ctx;
+    CHECK(cuCtxCreate(&ctx, 0, dev));
+
+    /* Connect to producer */
+    int sock = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (sock < 0) { perror("socket"); return 1; }
+    struct sockaddr_un sa = {.sun_family = AF_UNIX};
+    strncpy(sa.sun_path, SOCK_PATH, sizeof(sa.sun_path) - 1);
+
+    /* Up to 50 attempts, 100 ms apart, in case the producer is not yet listening. */
+    for (int retry = 0; retry < 50; retry++) {
+        if (connect(sock, (struct sockaddr *)&sa, sizeof(sa)) == 0) break;
+        if (retry == 49) { perror("connect (final)"); return 1; }
+        usleep(100000);
+    }
+    fprintf(stderr, "consumer: connected to producer\n");
+
+    int fd = -1;
+    uint64_t size = 0;
+    uint8_t magic = 0;
+    if (recv_fd(sock, &fd, &size, &magic) < 0) return 1;
+    fprintf(stderr, "consumer: recv fd=%d size=%llu magic=0x%02x\n",
+            fd, (unsigned long long)size, magic);
+
+    CUmemGenericAllocationHandle mem;
+    CHECK(cuMemImportFromShareableHandle(&mem, (void *)(uintptr_t)fd,
+              CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
+    fprintf(stderr, "consumer: imported handle OK\n");
+
+    CUdeviceptr ptr;
+    CHECK(cuMemAddressReserve(&ptr, size, 0, 0, 0));
+    CHECK(cuMemMap(ptr, size, 0, mem, 0));
+
+    /* Access is granted for this process's own device, read-only. */
+    CUmemAccessDesc access = {0};
+    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access.location.id = dev;
+    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READ;
+    CHECK(cuMemSetAccess(ptr, size, &access, 1));
+    fprintf(stderr, "consumer: mapped + access OK\n");
+
+    /* Copy out at most 1 MiB to confirm the pattern is there */
+    size_t check = size < (1 << 20) ? size : (1 << 20);
+    uint8_t *host = malloc(check);
+    if (!host) { perror("malloc"); return 1; }
+    CHECK(cuMemcpyDtoH(host, ptr, check));
+    CHECK(cuCtxSynchronize());
+
+    size_t mismatch = 0;
+    for (size_t i = 0; i < check; i++) {
+        if (host[i] != magic) mismatch++;
+    }
+    free(host);
+
+    char ack = (mismatch == 0) ? 'O' : 'X';
+    fprintf(stderr, "consumer: verify mismatch=%zu/%zu  → ACK=%c\n",
+            mismatch, check, ack);
+
+    /* The producer blocks on this byte; a lost ACK is a failed run. */
+    int delivered = (write(sock, &ack, 1) == 1);
+    if (!delivered) perror("write ack");
+    close(sock);
+    close(fd);
+
+    CHECK(cuMemUnmap(ptr, size));
+    CHECK(cuMemAddressFree(ptr, size));
+    CHECK(cuMemRelease(mem));
+    CHECK(cuCtxDestroy(ctx));
+
+    int ok = (ack == 'O') && delivered;
+    fprintf(stderr, "consumer: done (%s)\n", ok ? "OK" : "FAIL");
+    return ok ? 0 : 1;
+}
@@ -0,0 +1,103 @@
+#include "common.h"
+#include <errno.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+/* Send fd as SCM_RIGHTS ancillary data together with a
+ * (uint64_t size, uint8_t magic) payload in a single message.
+ * Returns 0 when the whole payload was sent, -1 on a sendmsg error or a
+ * short send. */
+static int send_fd(int sock, int fd, uint64_t size, uint8_t magic) {
+    /* The union gives the control buffer the alignment struct cmsghdr needs;
+     * zeroing it keeps padding bytes from carrying stack garbage. */
+    union {
+        struct cmsghdr hdr;
+        char buf[CMSG_SPACE(sizeof(int))];
+    } ctrl;
+    memset(&ctrl, 0, sizeof(ctrl));
+
+    struct iovec iov[2];
+    iov[0].iov_base = &size; iov[0].iov_len = sizeof(size);
+    iov[1].iov_base = &magic; iov[1].iov_len = sizeof(magic);
+
+    struct msghdr msg = {0};
+    msg.msg_iov = iov; msg.msg_iovlen = 2;
+    msg.msg_control = ctrl.buf; msg.msg_controllen = sizeof(ctrl.buf);
+
+    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+    cmsg->cmsg_level = SOL_SOCKET;
+    cmsg->cmsg_type = SCM_RIGHTS;
+    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+    memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
+
+    const ssize_t want = (ssize_t)(sizeof(size) + sizeof(magic));
+    ssize_t n = sendmsg(sock, &msg, 0);
+    if (n < 0) { perror("sendmsg"); return -1; }
+    if (n != want) {
+        /* The receiver reads size and magic from one message; a partial
+         * payload cannot be completed without resending the descriptor. */
+        fprintf(stderr, "short send: %zd of %zd bytes\n", n, want);
+        return -1;
+    }
+    return 0;
+}
+
+/* Spike producer: create an allocation that can be exported as a POSIX file
+ * descriptor, fill it with MAGIC_BYTE, hand the descriptor to one consumer
+ * over a Unix socket and wait for a one-byte ACK.
+ * Returns 0 when the ACK is 'O', otherwise 1; a failing CUDA call exits via
+ * CHECK. */
+int main(void) {
+    CHECK(cuInit(0));
+    CUdevice dev;
+    CHECK(cuDeviceGet(&dev, 0));
+    CUcontext ctx;
+    CHECK(cuCtxCreate(&ctx, 0, dev));
+
+    /* Allocation properties: located on device 0 and requested with the
+     * POSIX file descriptor handle type. */
+    CUmemAllocationProp prop = {0};
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = dev;
+    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+
+    size_t granularity = 0;
+    CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
+              CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+    fprintf(stderr, "producer: granularity=%zu\n", granularity);
+
+    /* Round POOL_SIZE up to a whole multiple of the granularity. */
+    size_t size = ((POOL_SIZE + granularity - 1) / granularity) * granularity;
+    fprintf(stderr, "producer: alloc size=%zu\n", size);
+
+    CUmemGenericAllocationHandle mem;
+    CHECK(cuMemCreate(&mem, size, &prop, 0));
+
+    /* Reserve an address range and map the allocation into it. */
+    CUdeviceptr ptr;
+    CHECK(cuMemAddressReserve(&ptr, size, 0, 0, 0));
+    CHECK(cuMemMap(ptr, size, 0, mem, 0));
+
+    /* Grant read/write access for the same location the allocation uses. */
+    CUmemAccessDesc access = {0};
+    access.location = prop.location;
+    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    CHECK(cuMemSetAccess(ptr, size, &access, 1));
+
+    /* Fill with MAGIC pattern */
+    CHECK(cuMemsetD8(ptr, MAGIC_BYTE, size));
+    CHECK(cuCtxSynchronize());
+
+    /* Export the allocation handle as a file descriptor to pass to the consumer. */
+    int fd;
+    CHECK(cuMemExportToShareableHandle(&fd, mem,
+              CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
+    fprintf(stderr, "producer: exported fd=%d for handle\n", fd);
+
+    /* Unix socket server */
+    /* Remove any socket file left at SOCK_PATH before binding. */
+    unlink(SOCK_PATH);
+    int srv = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (srv < 0) { perror("socket"); return 1; }
+    struct sockaddr_un sa = {.sun_family = AF_UNIX};
+    strncpy(sa.sun_path, SOCK_PATH, sizeof(sa.sun_path) - 1);
+    if (bind(srv, (struct sockaddr *)&sa, sizeof(sa)) < 0) { perror("bind"); return 1; }
+    if (listen(srv, 1) < 0) { perror("listen"); return 1; }
+
+    /* Serve exactly one consumer connection. */
+    fprintf(stderr, "producer: listening on %s, awaiting consumer...\n", SOCK_PATH);
+    int cli = accept(srv, NULL, NULL);
+    if (cli < 0) { perror("accept"); return 1; }
+
+    if (send_fd(cli, fd, (uint64_t)size, MAGIC_BYTE) < 0) return 1;
+    fprintf(stderr, "producer: sent fd + size=%zu + magic=0x%02x\n",
+            size, MAGIC_BYTE);
+
+    /* Wait for consumer ACK */
+    char ack;
+    if (read(cli, &ack, 1) != 1) { perror("read ack"); return 1; }
+    fprintf(stderr, "producer: got ACK=0x%02x\n", (unsigned char)ack);
+
+    close(cli);
+    close(srv);
+    unlink(SOCK_PATH);
+    close(fd);
+
+    /* Tear down in order: unmap, free the address range, release the handle. */
+    CHECK(cuMemUnmap(ptr, size));
+    CHECK(cuMemAddressFree(ptr, size));
+    CHECK(cuMemRelease(mem));
+    CHECK(cuCtxDestroy(ctx));
+
+    fprintf(stderr, "producer: done\n");
+    /* 'O' is the byte the consumer sends when its verification found no mismatch. */
+    return ack == 'O' ? 0 : 1;
+}
@@ -60,6 +60,9 @@ struct Args {
    bool verbose = false;
    bool realtime = false;  // emulate -re у ffmpeg CLI: sleep по pts
    bool loop = false;      // loop input на eof (для file://)
+    bool enable_packet_ring = false;  // v0.2 — публиковать encoded packets
+    std::string policy = "drop";  // "drop" = DROP_OLDEST, "wait" = STRICT_WAIT
+    int ack_timeout_ms = 200;     // only used при policy=wait; <=0 = infinite (unsafe)
 };

 static void print_usage() {
@@ -75,6 +78,16 @@ static void print_usage() {
        "  --ring N            cuframes ring size (default 4, range 2..16)\n"
        "  --realtime          pace input по PTS (как ffmpeg -re; полезно для файла)\n"
        "  --loop              loop input на EOF (только для file://)\n"
+        "  --enable-packet-ring  v0.2: дополнительно публиковать encoded packets\n"
+        "                       (для consumer'ов с -c:v copy, Frigate record path)\n"
+        "  --policy MODE        drop (default) = DROP_OLDEST — producer wrap'ает ring\n"
+        "                       без ожидания consumer ack. Подходит для multi-consumer.\n"
+        "                       wait = STRICT_WAIT — producer ждёт ack от всех subscribers\n"
+        "                       перед overwrite. Безопаснее для frame integrity, но slow\n"
+        "                       consumer задерживает all (default ack-timeout 200ms).\n"
+        "  --ack-timeout-ms N   только при --policy wait. Max wait для ack (default 200).\n"
+        "                       <=0 = infinite — НЕ РЕКОМЕНДУЕТСЯ (dead consumer вешает\n"
+        "                       producer навсегда).\n"
        "  --verbose           debug logs\n"
        "  -h, --help          this help\n";
 }
@@ -92,11 +105,24 @@ static int parse_args(int argc, char **argv, Args &a) {
        else if (s == "--ring") a.ring_size = std::stoi(next());
        else if (s == "--realtime") a.realtime = true;
        else if (s == "--loop") a.loop = true;
+        else if (s == "--enable-packet-ring") a.enable_packet_ring = true;
+        else if (s == "--policy") a.policy = next();
+        else if (s == "--ack-timeout-ms") a.ack_timeout_ms = std::stoi(next());
        else if (s == "--verbose") a.verbose = true;
        else if (s == "-h" || s == "--help") { print_usage(); std::exit(0); }
        else { std::cerr << "Unknown arg: " << s << "\n"; print_usage(); std::exit(1); }
    }
    if (a.rtsp_url.empty() || a.key.empty()) { print_usage(); return 1; }
+    if (a.policy != "drop" && a.policy != "wait") {
+        std::cerr << "Invalid --policy '" << a.policy << "' (use drop|wait)\n";
+        return 1;
+    }
+    if (a.policy == "wait" && a.ack_timeout_ms <= 0) {
+        std::cerr << "WARNING: --policy wait + --ack-timeout-ms<=0 = infinite wait.\n"
+                  << "  Dead consumer повесит producer навсегда. Forcing к 200ms.\n"
+                  << "  Set явно --ack-timeout-ms 200 (или больше) чтобы убрать warning.\n";
+        a.ack_timeout_ms = 200;
+    }
    return 0;
 }

@@ -205,35 +231,54 @@ int main(int argc, char **argv) {
        return 2;
    }

-    /* Pre-allocate cuframes pool (NV12 — что nvdec выдаёт) */
+    /* Pre-allocate cuframes pool (NV12 — что nvdec выдаёт).
+     * v0.4: publisher сам аллоцирует через cuMemCreate (VMM). Раньше tool
+     * передавал external pool, но v0.4 не может export'нуть cudaMalloc-pointers
+     * как POSIX FD — VMM API требует cuMemCreate-allocated memory. */
    int32_t pitch_y = 0, pitch_uv = 0;
    size_t frame_size = cuframes::calc_frame_size(CUFRAMES_FORMAT_NV12,
                                                    width, height,
                                                    &pitch_y, &pitch_uv);

    cudaSetDevice(a.cuda_device);
-    std::vector<void *> pool(a.ring_size, nullptr);
-    for (int i = 0; i < a.ring_size; ++i) {
-        cudaError_t cerr = cudaMalloc(&pool[i], frame_size);
-        if (cerr != cudaSuccess) {
-            std::cerr << "cudaMalloc pool[" << i << "]: " << cudaGetErrorString(cerr) << "\n";
-            return 2;
-        }
-    }

    cuframes::PublisherOptions po;
    po.key = a.key;
    po.width = width;
    po.height = height;
    po.format = CUFRAMES_FORMAT_NV12;
-    po.policy = CUFRAMES_POLICY_DROP_OLDEST;
+    po.policy = (a.policy == "wait")
+        ? CUFRAMES_POLICY_STRICT_WAIT
+        : CUFRAMES_POLICY_DROP_OLDEST;
+    po.consumer_ack_timeout_ms = a.ack_timeout_ms;
    po.cuda_device = a.cuda_device;
-    po.ring_size = a.ring_size;  /* для logging */
+    po.ring_size = a.ring_size;

-    cuframes::Publisher pub(po, pool.data(), a.ring_size, frame_size);
+    cuframes::Publisher pub(po);   /* LIBRARY ownership — publisher owns VMM pool */
    std::cerr << "[cuframes-src] publisher 'cuframes-" << a.key
-              << "' ready, ring=" << a.ring_size
-              << " pool_size=" << frame_size << " bytes/frame\n";
+              << "' ready (v0.4 VMM), ring=" << a.ring_size
+              << " frame_size=" << frame_size << " bytes\n";
+
+    /* v0.2 — encoded packet ring (опционально). */
+    if (a.enable_packet_ring) {
+        cuframes_packet_ring_options_t pkt_opts{};
+        pkt_opts.codec_id = (uint32_t)vstream->codecpar->codec_id;
+        /* остальные поля = 0 → library использует defaults (64 slots, 8MiB, 2MiB max) */
+        pub.enable_packets(&pkt_opts);
+
+        if (vstream->codecpar->extradata_size > 0 && vstream->codecpar->extradata) {
+            pub.set_codec_extradata(vstream->codecpar->extradata,
+                                     (size_t)vstream->codecpar->extradata_size);
+            std::cerr << "[cuframes-src] packet ring active, codec_id="
+                      << vstream->codecpar->codec_id
+                      << " extradata=" << vstream->codecpar->extradata_size
+                      << " bytes\n";
+        } else {
+            std::cerr << "[cuframes-src] packet ring active, codec_id="
+                      << vstream->codecpar->codec_id
+                      << " (no extradata in stream — will rely on in-band SPS/PPS)\n";
+        }
+    }

    /* Stream для D2D copies */
    cudaStream_t stream;
@@ -243,7 +288,6 @@ int main(int argc, char **argv) {
    AVFrame *frame = av_frame_alloc();
    if (!pkt || !frame) return 2;

-    int pool_idx = 0;
    uint64_t frame_count = 0;
    auto t_last_log = std::chrono::steady_clock::now();
    uint64_t last_log_count = 0;
@@ -279,6 +323,29 @@ int main(int argc, char **argv) {
            continue;
        }

+        /* v0.2 — публикуем encoded packet в packet ring ДО decoder. Это позволяет
+         * record-consumer'ам брать packet без второго RTSP-подключения к камере. */
+        if (a.enable_packet_ring) {
+            int64_t pkt_pts_ns = (pkt->pts != AV_NOPTS_VALUE)
+                ? av_rescale_q(pkt->pts, stream_tb, AVRational{1, 1000000000})
+                : cuframes::now_ns();
+            int64_t pkt_dts_ns = (pkt->dts != AV_NOPTS_VALUE)
+                ? av_rescale_q(pkt->dts, stream_tb, AVRational{1, 1000000000})
+                : pkt_pts_ns;
+            uint32_t pkt_flags = 0;
+            if (pkt->flags & AV_PKT_FLAG_KEY)        pkt_flags |= CUFRAMES_PKT_FLAG_KEY;
+            if (pkt->flags & AV_PKT_FLAG_CORRUPT)    pkt_flags |= CUFRAMES_PKT_FLAG_CORRUPT;
+#ifdef AV_PKT_FLAG_DISCONTINUITY
+            if (pkt->flags & AV_PKT_FLAG_DISCONTINUITY) pkt_flags |= CUFRAMES_PKT_FLAG_DISCONTINUITY;
+#endif
+            int prr = pub.publish_packet(pkt->data, (size_t)pkt->size,
+                                          pkt_pts_ns, pkt_dts_ns, pkt_flags);
+            if (prr != CUFRAMES_OK && a.verbose) {
+                std::cerr << "[cuframes-src] publish_packet rc=" << prr
+                          << " size=" << pkt->size << "\n";
+            }
+        }
+
        r = avcodec_send_packet(ctx, pkt);
        av_packet_unref(pkt);
        if (r < 0) continue;
@@ -302,7 +369,15 @@ int main(int argc, char **argv) {
            int src_pitch_y = frame->linesize[0];
            int src_pitch_uv = frame->linesize[1];

-            void *dst = pool[pool_idx];
+            /* v0.4: acquire slot из publisher's VMM pool */
+            void *dst = nullptr;
+            try {
+                dst = pub.acquire();
+            } catch (const cuframes::Error &e) {
+                std::cerr << "acquire: " << e.what() << "\n";
+                av_frame_unref(frame);
+                continue;
+            }

            /* D2D 2D-copy Y plane */
            cudaError_t cerr = cudaMemcpy2DAsync(
@@ -340,14 +415,13 @@ int main(int argc, char **argv) {

            int64_t pts_ns = cuframes::now_ns();
            try {
-                pub.publish_external(dst, stream, pts_ns);
+                pub.publish(stream, pts_ns);
            } catch (const cuframes::Error &e) {
-                std::cerr << "publish_external: " << e.what() << "\n";
+                std::cerr << "publish: " << e.what() << "\n";
                av_frame_unref(frame);
                continue;
            }

-            pool_idx = (pool_idx + 1) % a.ring_size;
            frame_count++;
            av_frame_unref(frame);

@@ -374,9 +448,7 @@ int main(int argc, char **argv) {
    av_buffer_unref(&hw_device);

    cudaStreamDestroy(stream);
-    /* Publisher destructor freed first; теперь освободим pool */
-    /* Note: publisher уже destroyed by RAII, IPC handles closed by subscribers */
-    for (auto p : pool) if (p) cudaFree(p);
+    /* v0.4: publisher owns VMM pool — destructor освободит cuMemRelease etc. */

    return 0;
 }