v0.4: VMM + POSIX FD — namespace decoupling (no pid share required)

Заменяет cudaMalloc + cudaIpcGetMemHandle на cuMemCreate (VMM) + cuMemExportToShareableHandle(POSIX_FILE_DESCRIPTOR). FD передаются consumer'у через sendmsg(SCM_RIGHTS) во время handshake. Frigate (s6-overlay не даёт share PID) и любой другой consumer работают БЕЗ pid namespace share — нужны только volume mount unix-сокета /run/cuframes и IPC share для header'а в /dev/shm.

Sync: вместо cudaEventRecord + IPC events теперь cuStreamSynchronize в do_publish. Producer ждёт (~1 ms), пока stream flush'нется, и только потом делает atomic_store(seq). Consumer читает seq через memory_order_acquire и копирует DtoD без event wait — HW coherence гарантирована на одном GPU.

ABI break (согласован с user'ом):
- magic 0xCC7C1DCC → 0xCC7C1DCE (старые consumers fail cleanly)
- protocol V3 → V4
- libcuframes.so.0: SOVERSION остаётся, но .so.0.3.0 → .so.0.4.0
- EXTERNAL ownership убран (VMM требует cuMemCreate-allocated memory, нельзя export'нуть произвольный cudaMalloc-pointer как POSIX FD)
- cuframes-rtsp-source переведён на LIBRARY mode + один D2D memcpy в acquire'нутый slot (overhead малый — publisher всё равно делал такой D2D из FFmpeg hwframe pool в EXTERNAL pool раньше)

Размер: granularity 2 MB на 5090 → NV12 1920×1080 (~3.1 MB) округляется до 4 MB, то есть +1 MB на slot × 16 слотов × 4 камеры = +64 MB VRAM. Терпимо.

Packet ring (cuframes_packets://) НЕ затронут — отдельный SHM со своим magic, работает как раньше.

PoC + smoke в spike/:
- vmm_fd_pingpong/ — minimal cuMemCreate+FD round-trip
- smoke_v04/ — full publisher+subscriber, 100/100 frames без pid share

Base image: Dockerfile.runtime → CUDA 12.4 (был 13.0). Совпадает с prod pipeline + Frigate base, иначе libcudart conflict при load.

Compose stack (localhost-infra repo) — параллельный commit:
- убран pid: container:cuframes-pub-parking из subscribers
- image теги: gx/cuframes:0.4, gx/cuda-grid-pipeline:phase8, gx/frigate:cuframes-v0.4

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 20:13:31 +01:00
parent d646f5a4e4
commit 4862247fe2
18 changed files with 946 additions and 391 deletions
@@ -231,21 +231,16 @@ int main(int argc, char **argv) {
        return 2;
    }

-    /* Pre-allocate cuframes pool (NV12 — что nvdec выдаёт) */
+    /* Pre-allocate cuframes pool (NV12 — что nvdec выдаёт).
+     * v0.4: publisher сам аллоцирует через cuMemCreate (VMM). Раньше tool
+     * передавал external pool, но v0.4 не может export'нуть cudaMalloc-pointers
+     * как POSIX FD — VMM API требует cuMemCreate-allocated memory. */
    int32_t pitch_y = 0, pitch_uv = 0;
    size_t frame_size = cuframes::calc_frame_size(CUFRAMES_FORMAT_NV12,
                                                    width, height,
                                                    &pitch_y, &pitch_uv);

    cudaSetDevice(a.cuda_device);
-    std::vector<void *> pool(a.ring_size, nullptr);
-    for (int i = 0; i < a.ring_size; ++i) {
-        cudaError_t cerr = cudaMalloc(&pool[i], frame_size);
-        if (cerr != cudaSuccess) {
-            std::cerr << "cudaMalloc pool[" << i << "]: " << cudaGetErrorString(cerr) << "\n";
-            return 2;
-        }
-    }

    cuframes::PublisherOptions po;
    po.key = a.key;
@@ -257,12 +252,12 @@ int main(int argc, char **argv) {
        : CUFRAMES_POLICY_DROP_OLDEST;
    po.consumer_ack_timeout_ms = a.ack_timeout_ms;
    po.cuda_device = a.cuda_device;
-    po.ring_size = a.ring_size;  /* для logging */
+    po.ring_size = a.ring_size;

-    cuframes::Publisher pub(po, pool.data(), a.ring_size, frame_size);
+    cuframes::Publisher pub(po);   /* LIBRARY ownership — publisher owns VMM pool */
    std::cerr << "[cuframes-src] publisher 'cuframes-" << a.key
-              << "' ready, ring=" << a.ring_size
-              << " pool_size=" << frame_size << " bytes/frame\n";
+              << "' ready (v0.4 VMM), ring=" << a.ring_size
+              << " frame_size=" << frame_size << " bytes\n";

    /* v0.2 — encoded packet ring (опционально). */
    if (a.enable_packet_ring) {
@@ -293,7 +288,6 @@ int main(int argc, char **argv) {
    AVFrame *frame = av_frame_alloc();
    if (!pkt || !frame) return 2;

-    int pool_idx = 0;
    uint64_t frame_count = 0;
    auto t_last_log = std::chrono::steady_clock::now();
    uint64_t last_log_count = 0;
@@ -375,7 +369,15 @@ int main(int argc, char **argv) {
            int src_pitch_y = frame->linesize[0];
            int src_pitch_uv = frame->linesize[1];

-            void *dst = pool[pool_idx];
+            /* v0.4: acquire slot из publisher's VMM pool */
+            void *dst = nullptr;
+            try {
+                dst = pub.acquire();
+            } catch (const cuframes::Error &e) {
+                std::cerr << "acquire: " << e.what() << "\n";
+                av_frame_unref(frame);
+                continue;
+            }

            /* D2D 2D-copy Y plane */
            cudaError_t cerr = cudaMemcpy2DAsync(
@@ -413,14 +415,13 @@ int main(int argc, char **argv) {

            int64_t pts_ns = cuframes::now_ns();
            try {
-                pub.publish_external(dst, stream, pts_ns);
+                pub.publish(stream, pts_ns);
            } catch (const cuframes::Error &e) {
-                std::cerr << "publish_external: " << e.what() << "\n";
+                std::cerr << "publish: " << e.what() << "\n";
                av_frame_unref(frame);
                continue;
            }

-            pool_idx = (pool_idx + 1) % a.ring_size;
            frame_count++;
            av_frame_unref(frame);

@@ -447,9 +448,7 @@ int main(int argc, char **argv) {
    av_buffer_unref(&hw_device);

    cudaStreamDestroy(stream);
-    /* Publisher destructor freed first; теперь освободим pool */
-    /* Note: publisher уже destroyed by RAII, IPC handles closed by subscribers */
-    for (auto p : pool) if (p) cudaFree(p);
+    /* v0.4: publisher owns VMM pool — destructor освободит cuMemRelease etc. */

    return 0;
 }