Phase 2: composer + libcugrid (N источников → 2x2 grid в NV12 буфер)

Multi-source композитор работает на 4K @ 25fps стабильно. Live-тест с 4 камерами (parking, back_yard, front_yard, gate_lpr): все 4 active, 350 кадров за 14с, 27.6 МБ H.264 файл, кадр декодируется ffmpeg'ом с корректным 2x2 layout'ом. Содержимое: - include/cuframes_composer/cugrid.h — публичный API libcugrid: cfc_cugrid_fill_nv12 (region fill с alpha blend), cfc_cugrid_resize_nv12 (bilinear scale в rect). - src/cugrid/cugrid.cu — извлечённые из vf_cuda_grid kernel'ы (Y+UV fill + bilinear resize), объединены с C launcher'ом в одном .cu файле, под LGPL-2.1+. - include/cuframes_composer/composer.h — публичный API композитора: cfc_composer_cell_t для layout, get_health для observability. - src/composer.c — manager N cfc_source_t + единый NV12 output buffer (cuMemAlloc, переиспользуется на каждом compose'е). compose_clear fillит фон BT.709-чёрным, compose_cell делает resize ACTIVE источника или оставляет blackout для DEAD/STALE/CONNECTING. - examples/grid_record — Phase 2 smoke test: N --cell ключ,x,y,w,h → grid composer → NVENC → file. Сборка: добавлен LANGUAGES CUDA и CMAKE_CUDA_ARCHITECTURES 89;120 (Ada + Blackwell). Compile options раздельные для C и CUDA (-Wpedantic не подходит для .cu). Phase 2 RTSP push отложен на отдельный commit — будет через pipe-out к локальному ffmpeg'у, который публикует в mediamtx (вариант утверждён в Q2 дизайн-документа).
2026-06-03 05:01:49 +01:00
parent eae902afb3
commit 1e2b5d4e16
8 changed files with 949 additions and 12 deletions
@@ -2,13 +2,20 @@ cmake_minimum_required(VERSION 3.20)
 project(cuframes-composer
    VERSION 0.1.0
    DESCRIPTION "Multi-source video grid composer на CUDA + NVENC + RTSP"
-    LANGUAGES C
+    LANGUAGES C CUDA
 )
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 # CUDA архитектуры. RTX 5090 = sm_120 (Blackwell consumer), но также 5090 не
 # поддерживает legacy sm_75, поэтому 89 (Ada Lovelace, RTX 4090) для совместимости
 # с тестовым окружением + 120 для прода.
 if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES "89;120")
 endif()
 if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
 endif()
@@ -10,3 +10,8 @@
 add_executable(simple_record simple_record.c)
 target_link_libraries(simple_record PRIVATE cuframes_composer_static)
 target_include_directories(simple_record PRIVATE ${CMAKE_SOURCE_DIR}/include)
 # Phase 2 — multi-source grid в файл.
 add_executable(grid_record grid_record.c)
 target_link_libraries(grid_record PRIVATE cuframes_composer_static)
 target_include_directories(grid_record PRIVATE ${CMAKE_SOURCE_DIR}/include)
@@ -0,0 +1,270 @@
 /* grid_record — Phase 2 smoke test.
 *
 * Подписывается на 4 cuframes-источника (4 камеры), композирует их в 2×2 grid
 * через cfc_composer, кодирует через NVENC, пишет H.264 в файл.
 *
 * Layout 2×2 1080p:
 *   Output: 3840×2160 (4K)
 *   Cells:  4 шт. 1920×1080 в углах
 *
 * Использование:
 *   grid_record --out 4k.h264 \
 *               --cell cam-parking,0,0,1920,1080 \
 *               --cell cam-back_yard,1920,0,1920,1080 \
 *               --cell cam-front_yard,0,1080,1920,1080 \
 *               --cell cam-gate_lpr,1920,1080,1920,1080 \
 *               --seconds 15
 *
 * Лицензия: LGPL-2.1+
 */
 #include "../include/cuframes_composer/composer.h"
 #include "../include/cuframes_composer/nvenc.h"
 #include <cuda.h>
 #include <errno.h>
 #include <getopt.h>
 #include <signal.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
 #define MAX_CELLS 16
 static volatile sig_atomic_t g_stop = 0;
 static void on_sig(int s) { (void)s; g_stop = 1; }
 typedef struct write_ctx {
    FILE *fp;
    uint64_t bytes_written;
    uint64_t frames_encoded;
    uint64_t idr_count;
 } write_ctx_t;
 static void on_bitstream(const uint8_t *bs, size_t size, int64_t pts_ns,
                          int is_idr, void *user)
 {
    (void)pts_ns;
    write_ctx_t *ctx = (write_ctx_t *)user;
    if (fwrite(bs, 1, size, ctx->fp) == size) {
        ctx->bytes_written += size;
        ctx->frames_encoded++;
        if (is_idr) ctx->idr_count++;
    }
 }
 static int parse_cell(const char *arg, cfc_composer_cell_t *out,
                       char *key_storage)
 {
    /* Формат: key,x,y,w,h */
    char buf[128];
    strncpy(buf, arg, sizeof(buf) - 1);
    buf[sizeof(buf) - 1] = '\0';
    char *tok = strtok(buf, ",");
    if (!tok) return -1;
    strncpy(key_storage, tok, 63);
    key_storage[63] = '\0';
    out->source_key = key_storage;
    tok = strtok(NULL, ","); if (!tok) return -1; out->x = atoi(tok);
    tok = strtok(NULL, ","); if (!tok) return -1; out->y = atoi(tok);
    tok = strtok(NULL, ","); if (!tok) return -1; out->w = atoi(tok);
    tok = strtok(NULL, ","); if (!tok) return -1; out->h = atoi(tok);
    return 0;
 }
 static const char *cu_err(CUresult r)
 {
    const char *s = NULL;
    cuGetErrorString(r, &s);
    return s ? s : "?";
 }
 int main(int argc, char **argv)
 {
    const char *out_path = NULL;
    int fps = 25, bitrate = 10000, max_seconds = 0;
    int out_w = 3840, out_h = 2160;
    cfc_composer_cell_t cells[MAX_CELLS] = { 0 };
    static char cell_keys[MAX_CELLS][64];
    int num_cells = 0;
    static struct option opts[] = {
        {"out",     required_argument, 0, 'o'},
        {"cell",    required_argument, 0, 'c'},
        {"fps",     required_argument, 0, 'f'},
        {"bitrate", required_argument, 0, 'b'},
        {"width",   required_argument, 0, 'W'},
        {"height",  required_argument, 0, 'H'},
        {"seconds", required_argument, 0, 's'},
        {0, 0, 0, 0},
    };
    int c;
    while ((c = getopt_long(argc, argv, "o:c:f:b:W:H:s:", opts, NULL)) != -1) {
        switch (c) {
        case 'o': out_path = optarg; break;
        case 'c':
            if (num_cells >= MAX_CELLS) {
                fprintf(stderr, "max %d cells\n", MAX_CELLS);
                return 1;
            }
            if (parse_cell(optarg, &cells[num_cells], cell_keys[num_cells]) != 0) {
                fprintf(stderr, "invalid --cell '%s' (key,x,y,w,h)\n", optarg);
                return 1;
            }
            num_cells++;
            break;
        case 'f': fps = atoi(optarg); break;
        case 'b': bitrate = atoi(optarg); break;
        case 'W': out_w = atoi(optarg); break;
        case 'H': out_h = atoi(optarg); break;
        case 's': max_seconds = atoi(optarg); break;
        default: return 1;
        }
    }
    if (!out_path || num_cells == 0) {
        fprintf(stderr,
            "Использование: %s --out <file.h264> --cell key,x,y,w,h [--cell ...]\n"
            "                [--width 3840] [--height 2160] [--fps 25]\n"
            "                [--bitrate 10000] [--seconds N]\n",
            argv[0]);
        return 1;
    }
    signal(SIGINT, on_sig);
    signal(SIGTERM, on_sig);
    /* CUDA primary context. */
    CUresult cr = cuInit(0);
    if (cr != CUDA_SUCCESS) { fprintf(stderr, "cuInit: %s\n", cu_err(cr)); return 1; }
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    CUcontext ctx;
    cuDevicePrimaryCtxRetain(&ctx, dev);
    cuCtxPushCurrent(ctx);
    /* Composer. */
    cfc_composer_config_t ccfg = {
        .width = out_w,
        .height = out_h,
        .cells = cells,
        .num_cells = num_cells,
        .cuda_device = 0,
    };
    cfc_composer_t *comp = NULL;
    if (cfc_composer_create(&ccfg, &comp) != 0) {
        fprintf(stderr, "cfc_composer_create failed\n");
        return 1;
    }
    fprintf(stderr, "[grid_record] composer %dx%d, %d ячеек\n",
            out_w, out_h, num_cells);
    /* Encoder. */
    cfc_encoder_config_t ecfg = {
        .cuda_ctx = ctx,
        .width = out_w,
        .height = out_h,
        .fps_num = fps,
        .fps_den = 1,
        .bitrate_kbps = bitrate,
        .gop_size = fps,
        .num_b_frames = 0,
        .preset = "ll",
    };
    cfc_encoder_t *enc = NULL;
    if (cfc_encoder_create(&ecfg, &enc) != 0) {
        fprintf(stderr, "cfc_encoder_create failed\n");
        cfc_composer_destroy(comp);
        return 1;
    }
    /* Output file. */
    write_ctx_t wctx = { 0 };
    wctx.fp = fopen(out_path, "wb");
    if (!wctx.fp) {
        fprintf(stderr, "fopen(%s): %s\n", out_path, strerror(errno));
        cfc_encoder_destroy(enc);
        cfc_composer_destroy(comp);
        return 1;
    }
    fprintf(stderr, "[grid_record] начало записи в %s (Ctrl+C для остановки)\n",
            out_path);
    /* Main loop — frame cadence по wall clock'у. */
    struct timespec ts_start;
    clock_gettime(CLOCK_MONOTONIC, &ts_start);
    int64_t start_us = (int64_t)ts_start.tv_sec * 1000000 + ts_start.tv_nsec / 1000;
    int64_t frame_us = 1000000 / fps;
    int64_t next_us = start_us;
    while (!g_stop) {
        struct timespec now;
        clock_gettime(CLOCK_MONOTONIC, &now);
        int64_t now_us = (int64_t)now.tv_sec * 1000000 + now.tv_nsec / 1000;
        if (now_us < next_us) {
            int64_t sleep_us = next_us - now_us;
            if (sleep_us > 1000000) sleep_us = 1000000;
            struct timespec ts = {
                .tv_sec = sleep_us / 1000000,
                .tv_nsec = (sleep_us % 1000000) * 1000,
            };
            nanosleep(&ts, NULL);
            continue;
        }
        next_us += frame_us;
        CUdeviceptr out_y = 0;
        int out_pitch = 0, oW = 0, oH = 0;
        if (cfc_composer_compose(comp, &out_y, &out_pitch, &oW, &oH) != 0) {
            fprintf(stderr, "[grid_record] compose failed\n");
            break;
        }
        int64_t pts_ns = (now_us - start_us) * 1000;
        if (cfc_encoder_encode_frame(enc, out_y, out_pitch, pts_ns,
                                      on_bitstream, &wctx) != 0) {
            fprintf(stderr, "[grid_record] encode failed\n");
            break;
        }
        if (wctx.frames_encoded > 0 && wctx.frames_encoded % 50 == 0) {
            double elapsed = (now_us - start_us) / 1e6;
            cfc_composer_health_t h;
            cfc_composer_get_health(comp, &h);
            fprintf(stderr,
                "[grid_record] %llu кадров, %llu IDR, %.1f МБ за %.1fс (%.1f fps) | "
                "src active=%d stale=%d dead=%d\n",
                (unsigned long long)wctx.frames_encoded,
                (unsigned long long)wctx.idr_count,
                wctx.bytes_written / 1048576.0,
                elapsed,
                wctx.frames_encoded / elapsed,
                h.active, h.stale, h.dead);
        }
        if (max_seconds > 0 && (now_us - start_us) / 1000000 >= max_seconds) {
            fprintf(stderr, "[grid_record] лимит %dс\n", max_seconds);
            break;
        }
    }
    fprintf(stderr, "[grid_record] flush encoder\n");
    cfc_encoder_flush(enc, on_bitstream, &wctx);
    fprintf(stderr,
        "[grid_record] итого: %llu кадров, %llu IDR, %.2f МБ\n",
        (unsigned long long)wctx.frames_encoded,
        (unsigned long long)wctx.idr_count,
        wctx.bytes_written / 1048576.0);
    fclose(wctx.fp);
    cfc_encoder_destroy(enc);
    cfc_composer_destroy(comp);
    cuCtxPopCurrent(NULL);
    cuDevicePrimaryCtxRelease(dev);
    return 0;
 }
@@ -0,0 +1,97 @@
 /* cuframes-composer — высокоуровневый композитор N источников в grid.
 *
 * Управляет:
 *   - N cfc_source (фоновые подписки на cuframes-publisher'ы)
 *   - Layout (cell positions + size)
 *   - Output NV12 buffer (cuMemAlloc, переиспользуется)
 *   - Композиция на каждом тике: clear background + resize/blit для каждого ACTIVE,
 *     fill чёрным для DEAD/STALE.
 *
 * Phase 2 архитектура:
 *   Композитор работает синхронно: caller вызывает cfc_composer_compose() →
 *   композитор для каждого источника берёт snapshot и пишет в output. После
 *   вызова output NV12 buffer готов к encode.
 *
 * Каноническая интеграция:
 *   for (;;) {
 *       cfc_composer_compose(comp);          // grid готов в output buffer
 *       cfc_encoder_encode_frame(enc, ...);  // → H.264
 *   }
 *
 * Лицензия: LGPL-2.1+
 */
 #ifndef CUFRAMES_COMPOSER_COMPOSER_H
 #define CUFRAMES_COMPOSER_COMPOSER_H
 #include "source.h"
 #include <cuda.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Описание одной ячейки grid'а — где на output буфере рисуется источник.
 * Все координаты в full-res пикселях, должны быть чётными. */
 typedef struct cfc_composer_cell {
    const char *source_key;           /* cuframes key, например "cam-parking" */
    int x, y;                         /* top-left угол на output буфере */
    int w, h;                         /* размер ячейки */
 } cfc_composer_cell_t;
 typedef struct cfc_composer_config {
    /* Output буфер (одно фиксированное разрешение для всего grid'а). */
    int width;                        /* output ширина (например 3840 для 2×2 1080p) */
    int height;                       /* output высота (например 2160 для 2×2 1080p) */
    /* Cells конфигурация. Массив указателей на cells (для удобства user'а). */
    const cfc_composer_cell_t *cells; /* array, не копируется — caller держит */
    int num_cells;
    /* CUDA устройство. */
    int cuda_device;                  /* индекс, обычно 0 */
    /* Backgound цвет (BT.709 limited): Y=16/U=128/V=128 = чёрный. */
    int bg_y, bg_u, bg_v;
    /* Параметры stale/dead для источников. */
    int reconnect_min_ms;             /* default 1000 */
    int reconnect_max_ms;             /* default 30000 */
    int stale_threshold_ms;           /* default 500 */
    int dead_threshold_ms;            /* default 5000 */
 } cfc_composer_config_t;
 typedef struct cfc_composer cfc_composer_t;
 /* Создать композитор. Выделяет output NV12 буфер, запускает N source thread'ов. */
 int cfc_composer_create(const cfc_composer_config_t *cfg, cfc_composer_t **out);
 /* Скомпоновать один кадр. Вернёт указатели на output NV12 (Y + UV) и pitch.
 * Указатели действительны до следующего compose. */
 int cfc_composer_compose(
    cfc_composer_t *comp,
    CUdeviceptr *out_y_ptr,
    int *out_pitch_y,
    int *out_width,
    int *out_height
 );
 /* Получить layout статистику по источникам — для debug / health-репортов. */
 typedef struct cfc_composer_health {
    int total;                        /* всего источников */
    int active;                       /* в состоянии ACTIVE */
    int stale;                        /* в STALE */
    int dead;                         /* DEAD/DISCONNECTED/CONNECTING */
 } cfc_composer_health_t;
 int cfc_composer_get_health(cfc_composer_t *comp, cfc_composer_health_t *out);
 /* Уничтожить композитор. */
 int cfc_composer_destroy(cfc_composer_t *comp);
 #ifdef __cplusplus
 }
 #endif
 #endif /* CUFRAMES_COMPOSER_COMPOSER_H */
@@ -0,0 +1,81 @@
 /* cuframes-composer — CUDA kernels для grid-композиции NV12 кадров.
 *
 * Извлечено из vf_cuda_grid (FFmpeg out-of-tree filter), под LGPL-2.1+.
 * Все операции работают на NV12-кадрах:
 *   - Y plane: full resolution, 1 byte per pixel
 *   - UV plane: half resolution, 2 bytes per pixel (interleaved U,V)
 *
 * Все операции:
 *   - Принимают CUstream (NULL = default stream)
 *   - Не аллоцируют память (caller выделяет dst буфер заранее)
 *   - Idempotent для re-use (composer вызывает на каждом кадре)
 *
 * Lifecycle:
 *   cfc_cugrid_init() один раз при старте композитора (no-op для CUDA, но
 *                     резервирован под cuModuleLoad если перейдём на PTX)
 *
 * Лицензия: LGPL-2.1+
 */
 #ifndef CUFRAMES_COMPOSER_CUGRID_H
 #define CUFRAMES_COMPOSER_CUGRID_H
 #include <cuda.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Lazy init (no-op в текущей реализации; зарезервировано под future PTX module). */
 int cfc_cugrid_init(void);
 /* Заполнить прямоугольник на NV12 кадре solid color'ом с alpha-блендингом.
 *
 *   dst_y, dst_uv     — указатели на Y и UV plane соответственно.
 *   pitch_y, pitch_uv — pitch в bytes (для NV12 обычно равны и кратны 256).
 *   x, y, w, h        — прямоугольник в full-res пикселях (chroma usually пишется
 *                       автоматически на половинном rastr'е).
 *                       w и h должны быть чётными (требование 4:2:0 subsampling'а).
 *   color_y/u/v       — компоненты цвета в BT.709 limited range (Y: 16-235, UV: 16-240).
 *                       Для чёрного: Y=16, U=128, V=128.
 *   alpha             — 0..255 (0 = transparent, 255 = opaque).
 *
 * Возвращает 0 при успехе.
 */
 int cfc_cugrid_fill_nv12(
    CUstream stream,
    CUdeviceptr dst_y, int pitch_y,
    CUdeviceptr dst_uv, int pitch_uv,
    int x, int y, int w, int h,
    int color_y, int color_u, int color_v, int alpha
 );
 /* Resize NV12 src → rect (dst_x, dst_y, dst_w, dst_h) на NV12 dst.
 *
 * Bilinear interpolation на Y и UV. Y и UV plane src'а должны быть одним
 * allocation'ом (NV12 layout) или хотя бы иметь корректные указатели каждый.
 *
 *   src_y, src_uv      — src указатели.
 *   src_w, src_h       — размер src в full-res пикселях.
 *   src_pitch_y/uv     — pitch'ы (для NV12 cuframes pitch_y == pitch_uv).
 *   dst_y, dst_uv      — dst указатели.
 *   dst_pitch_y/uv     — pitch'ы dst NV12 кадра.
 *   dst_x, dst_y_off,  — координата top-left прямоугольника на dst (full-res пиксели).
 *   dst_w, dst_h         dst_x и dst_y_off, dst_w, dst_h ДОЛЖНЫ быть чётными
 *                       (для UV/chroma subsampling'а).
 */
 int cfc_cugrid_resize_nv12(
    CUstream stream,
    CUdeviceptr src_y, int src_w, int src_h, int src_pitch_y,
    CUdeviceptr src_uv,                       int src_pitch_uv,
    CUdeviceptr dst_y, int dst_pitch_y,
    CUdeviceptr dst_uv, int dst_pitch_uv,
    int dst_x, int dst_y_off, int dst_w, int dst_h
 );
 #ifdef __cplusplus
 }
 #endif
 #endif /* CUFRAMES_COMPOSER_CUGRID_H */
@@ -1,22 +1,27 @@
 # Основная библиотека композитора — `libcuframes_composer.so`. Содержит:
-#   - source.c       подписка к cuframes публишеру + state machine
+#   - source.c          подписка к cuframes публишеру + state machine
-#   - nvenc_loader.c dlopen libnvidia-encode.so + загрузка API таблицы
+#   - nvenc_loader.c    dlopen libnvidia-encode.so + загрузка API таблицы
-#   - nvenc.c        обвязка вокруг NVENC SDK (init/encode/teardown)
+#   - nvenc.c           обвязка вокруг NVENC SDK (init/encode/teardown)
 #   - composer.c        Phase 2: multi-source grid композитор
 #   - cugrid/cugrid.cu  Phase 2: CUDA kernels (resize, fill) для NV12 grid'а
 #
 # Дальше по фазам:
-#   Phase 2: compose.c (CUDA composition), ringbuf.c (SPSC swap)
+#   Phase 3: overlay.c, png_decode.c, text_render.c, rtsp_publisher.c, rtp_h264.c
 #   Phase 3: rtsp_publisher.c, rtp_h264.c, overlay.c, png_decode.c, text_render.c
 #   Phase 4: control_zmq.c
 #   Phase 5: health_mqtt.c
-set(COMPOSER_SOURCES
+set(COMPOSER_SOURCES_C
    source.c
    nvenc_loader.c
    nvenc.c
    composer.c
 )
 set(COMPOSER_SOURCES_CU
    cugrid/cugrid.cu
 )
-add_library(cuframes_composer SHARED ${COMPOSER_SOURCES})
+add_library(cuframes_composer SHARED ${COMPOSER_SOURCES_C} ${COMPOSER_SOURCES_CU})
-add_library(cuframes_composer_static STATIC ${COMPOSER_SOURCES})
+add_library(cuframes_composer_static STATIC ${COMPOSER_SOURCES_C} ${COMPOSER_SOURCES_CU})
 foreach(target cuframes_composer cuframes_composer_static)
    target_include_directories(${target}
@@ -28,10 +33,15 @@ foreach(target cuframes_composer cuframes_composer_static)
            ${NVCODEC_HEADERS_DIR}
    )
    target_compile_features(${target} PRIVATE c_std_11)
    # C-only флаги (для CUDA свои дефолты, -Wpedantic не подходит для .cu).
    target_compile_options(${target} PRIVATE
-        -Wall -Wextra -Wpedantic
+        $<$<COMPILE_LANGUAGE:C>:-Wall>
-        $<$<CONFIG:Debug>:-O0 -g3>
+        $<$<COMPILE_LANGUAGE:C>:-Wextra>
-        $<$<CONFIG:Release>:-O2 -g>
+        $<$<COMPILE_LANGUAGE:C>:-Wpedantic>
        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Debug>>:-O0>
        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Debug>>:-g3>
        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Release>>:-O2>
        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Release>>:-g>
    )
    target_link_libraries(${target}
        PUBLIC
@@ -42,6 +52,10 @@ foreach(target cuframes_composer cuframes_composer_static)
            ${LIBDL_LIBRARY}           # для dlopen libnvidia-encode.so
            rt
    )
    # CUDA properties.
    set_target_properties(${target} PROPERTIES
        CUDA_SEPARABLE_COMPILATION OFF
    )
 endforeach()
 set_target_properties(cuframes_composer PROPERTIES
@@ -0,0 +1,270 @@
 /* Реализация cfc_composer_t — multi-source grid композитор.
 *
 * Owns:
 *   - N cfc_source_t (по одному на ячейку grid'а)
 *   - один NV12 output buffer (cuMemAlloc — staging для NVENC encoder'а)
 *   - statistics для health-репортов
 *
 * Compose-цикл:
 *   1) cuMemsetD8 → быстрое черный fill всего Y plane (16=BT.709 black)
 *      + UV plane заполняется отдельно (128,128).
 *   2) Для каждой ячейки:
 *      a) get_latest snapshot.
 *      b) ACTIVE → cfc_cugrid_resize_nv12 (src VMM → dst rect)
 *      c) DEAD/STALE → cfc_cugrid_fill_nv12 чёрным с alpha=255 уже сделано,
 *         тут лучше визуально показать что источник упал, поэтому в Phase 3
 *         поверх blackout рисуется текст «NO SIGNAL» через overlay'и.
 *   3) cudaStreamSynchronize → output готов.
 *
 * Phase 2 упрощения:
 *   - Sync compose на default stream. Stream pipelining — Phase 3+.
 *   - Без double buffering. encode и compose делаются строго последовательно.
 *
 * Лицензия: LGPL-2.1+
 */
 #include "../include/cuframes_composer/composer.h"
 #include "../include/cuframes_composer/cugrid.h"
 #include <cuda_runtime.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #define CFC_COMPOSER_MAX_CELLS 64
 struct cfc_composer {
    cfc_composer_config_t cfg;
    /* Копии cells (caller владеет original config'ом). source_key копируется
     * в персистентную строку чтобы cfc_source_t могла на неё указывать. */
    cfc_composer_cell_t cells[CFC_COMPOSER_MAX_CELLS];
    char cell_keys[CFC_COMPOSER_MAX_CELLS][64];
    int num_cells;
    /* Источники по индексу cell'а (или NULL если cell — статичная картинка). */
    cfc_source_t *sources[CFC_COMPOSER_MAX_CELLS];
    /* Output NV12 буфер: один contiguous allocation, Y plane (pitch * h) +
     * UV plane (pitch * h/2). Pitch выравнен на 256 байт. */
    CUdeviceptr output_ptr;
    int output_pitch_y;
    int output_pitch_uv;
    size_t output_size;
    /* CUDA stream для compose (Phase 2 — default stream = 0). */
    cudaStream_t stream;
 };
 /* ── Helpers ──────────────────────────────────────────────────────────── */
 static int round_up_pitch(int w)
 {
    return (w + 255) & ~255;
 }
 static void *cu_ptr(CUdeviceptr p) { return (void *)(uintptr_t)p; }
 /* ── Compose ──────────────────────────────────────────────────────────── */
 static int compose_clear(cfc_composer_t *comp)
 {
    /* Y plane → 16 (BT.709 black). */
    cudaError_t e = cudaMemsetAsync(
        cu_ptr(comp->output_ptr), comp->cfg.bg_y,
        (size_t)comp->output_pitch_y * comp->cfg.height,
        comp->stream);
    if (e != cudaSuccess) {
        fprintf(stderr, "[cfc/composer] Y memset failed: %s\n", cudaGetErrorString(e));
        return -1;
    }
    /* UV plane → нужны два значения (U=128, V=128), не один. Делаем fill
     * через тот же cfc_cugrid_fill_nv12 которым fillим ячейки. Прокидываем
     * alpha=255 чтобы перезатереть полностью. */
    CUdeviceptr uv = comp->output_ptr +
                     (size_t)comp->output_pitch_y * comp->cfg.height;
    /* Просто memset UV не подходит — там interleaved пары. Делаем fill_nv12
     * с alpha=255, тогда формула станет dst = fill * 255 / 255 = fill. */
    int rc = cfc_cugrid_fill_nv12(
        (CUstream)comp->stream,
        /* Y уже сделан выше — но fill_nv12 повторно fillит Y. Передаём
         * y_color = bg_y, alpha=255 — get тот же результат, минор waste.
         * Phase 2 acceptable, в Phase 3 разделим Y/UV fillы. */
        comp->output_ptr, comp->output_pitch_y,
        uv, comp->output_pitch_uv,
        0, 0, comp->cfg.width, comp->cfg.height,
        comp->cfg.bg_y, comp->cfg.bg_u, comp->cfg.bg_v, 255);
    return rc;
 }
 static int compose_cell(cfc_composer_t *comp, int idx)
 {
    const cfc_composer_cell_t *cell = &comp->cells[idx];
    cfc_source_t *src = comp->sources[idx];
    if (!src) return 0;
    cfc_source_snapshot_t snap;
    cfc_source_get_latest(src, &snap);
    if (snap.state != CFC_SOURCE_ACTIVE || snap.width <= 0) {
        /* DEAD/STALE/CONNECTING — оставляем чёрный (уже clear'нут).
         * Phase 3 добавит overlay «NO SIGNAL». */
        return 0;
    }
    CUdeviceptr uv = comp->output_ptr +
                     (size_t)comp->output_pitch_y * comp->cfg.height;
    /* Source NV12 layout: Y (pitch_y * height) + UV (pitch_y * height/2)
     * непрерывно. Указатель на UV plane = ptr + pitch_y * height. */
    CUdeviceptr src_uv = snap.ptr + (size_t)snap.pitch_y * snap.height;
    return cfc_cugrid_resize_nv12(
        (CUstream)comp->stream,
        snap.ptr, snap.width, snap.height, snap.pitch_y,
        src_uv, snap.pitch_uv,
        comp->output_ptr, comp->output_pitch_y,
        uv, comp->output_pitch_uv,
        cell->x, cell->y, cell->w, cell->h);
 }
 /* ── Public API ───────────────────────────────────────────────────────── */
 int cfc_composer_create(const cfc_composer_config_t *cfg, cfc_composer_t **out)
 {
    if (!cfg || !out) return -1;
    if (cfg->width <= 0 || cfg->height <= 0) return -1;
    if (cfg->num_cells <= 0 || cfg->num_cells > CFC_COMPOSER_MAX_CELLS) return -1;
    if (!cfg->cells) return -1;
    cfc_composer_t *comp = calloc(1, sizeof(*comp));
    if (!comp) return -1;
    comp->cfg = *cfg;
    comp->num_cells = cfg->num_cells;
    comp->stream = 0;  /* default stream Phase 2 */
    /* Дефолты для bg цвета (если caller не задал). */
    if (!comp->cfg.bg_y) comp->cfg.bg_y = 16;
    if (!comp->cfg.bg_u) comp->cfg.bg_u = 128;
    if (!comp->cfg.bg_v) comp->cfg.bg_v = 128;
    /* Сохраняем cells + копируем source_key в персистентное хранилище. */
    for (int i = 0; i < cfg->num_cells; i++) {
        comp->cells[i] = cfg->cells[i];
        if (cfg->cells[i].source_key) {
            strncpy(comp->cell_keys[i], cfg->cells[i].source_key,
                    sizeof(comp->cell_keys[i]) - 1);
            comp->cells[i].source_key = comp->cell_keys[i];
        }
    }
    /* Выделяем output NV12 буфер. */
    comp->output_pitch_y = round_up_pitch(cfg->width);
    comp->output_pitch_uv = comp->output_pitch_y;
    comp->output_size = (size_t)comp->output_pitch_y * cfg->height +
                        (size_t)comp->output_pitch_uv * (cfg->height / 2);
    CUresult cr = cuMemAlloc(&comp->output_ptr, comp->output_size);
    if (cr != CUDA_SUCCESS) {
        const char *es = NULL; cuGetErrorString(cr, &es);
        fprintf(stderr, "[cfc/composer] cuMemAlloc(%zu) failed: %s\n",
                comp->output_size, es ? es : "?");
        free(comp);
        return -1;
    }
    /* Создаём источники по одному на cell с source_key. */
    for (int i = 0; i < comp->num_cells; i++) {
        if (!comp->cells[i].source_key) continue;
        char name[32];
        snprintf(name, sizeof(name), "composer-%d", i);
        cfc_source_config_t scfg = {
            .key = comp->cells[i].source_key,
            .consumer_name = name,
            .cuda_device = cfg->cuda_device,
            .reconnect_min_ms = cfg->reconnect_min_ms,
            .reconnect_max_ms = cfg->reconnect_max_ms,
            .stale_threshold_ms = cfg->stale_threshold_ms,
            .dead_threshold_ms = cfg->dead_threshold_ms,
        };
        if (cfc_source_create(&scfg, &comp->sources[i]) != 0) {
            fprintf(stderr,
                    "[cfc/composer] cfc_source_create failed для cell %d (%s)\n",
                    i, comp->cells[i].source_key);
            /* Не fatal — DEAD source просто будет показывать blackout.
             * Ячейка остаётся NULL → compose_cell сразу выйдет. */
        }
    }
    cfc_cugrid_init();
    *out = comp;
    return 0;
 }
 int cfc_composer_compose(cfc_composer_t *comp,
                          CUdeviceptr *out_y_ptr,
                          int *out_pitch_y,
                          int *out_width,
                          int *out_height)
 {
    if (!comp) return -1;
    if (compose_clear(comp) != 0) return -1;
    for (int i = 0; i < comp->num_cells; i++) {
        if (compose_cell(comp, i) != 0) {
            fprintf(stderr, "[cfc/composer] compose_cell %d failed\n", i);
            /* Не fatal — продолжаем с остальными ячейками. */
        }
    }
    cudaError_t e = cudaStreamSynchronize(comp->stream);
    if (e != cudaSuccess) {
        fprintf(stderr, "[cfc/composer] stream sync failed: %s\n",
                cudaGetErrorString(e));
        return -1;
    }
    if (out_y_ptr) *out_y_ptr = comp->output_ptr;
    if (out_pitch_y) *out_pitch_y = comp->output_pitch_y;
    if (out_width) *out_width = comp->cfg.width;
    if (out_height) *out_height = comp->cfg.height;
    return 0;
 }
 int cfc_composer_get_health(cfc_composer_t *comp, cfc_composer_health_t *out)
 {
    if (!comp || !out) return -1;
    memset(out, 0, sizeof(*out));
    out->total = comp->num_cells;
    for (int i = 0; i < comp->num_cells; i++) {
        if (!comp->sources[i]) {
            out->dead++;
            continue;
        }
        cfc_source_snapshot_t snap;
        cfc_source_get_latest(comp->sources[i], &snap);
        switch (snap.state) {
        case CFC_SOURCE_ACTIVE: out->active++; break;
        case CFC_SOURCE_STALE:  out->stale++; break;
        default:                out->dead++; break;
        }
    }
    return 0;
 }
 int cfc_composer_destroy(cfc_composer_t *comp)
 {
    if (!comp) return 0;
    for (int i = 0; i < comp->num_cells; i++) {
        if (comp->sources[i]) cfc_source_destroy(comp->sources[i]);
    }
    if (comp->output_ptr) cuMemFree(comp->output_ptr);
    free(comp);
    return 0;
 }
@@ -0,0 +1,193 @@
 /* CUDA kernels + C launcher для grid-композиции NV12 кадров.
 *
 * Извлечено из vf_cuda_grid (FFmpeg out-of-tree filter), LGPL-2.1+.
 *
 * Все операции работают на NV12 кадрах:
 *   Y plane:  1 byte/pixel
 *   UV plane: 2 bytes/UV-pair (interleaved U,V), pitch обычно == pitch_y
 *
 * Стандартный block size: 16×16 = 256 threads, кратно warp 32.
 *
 * Lifecycle:
 *   cfc_cugrid_init() один раз при старте композитора (no-op для CUDA C++
 *   runtime kernels, зарезервировано на cuModuleLoad если перейдём на PTX)
 *
 * Лицензия: LGPL-2.1+
 */
 #include "../../include/cuframes_composer/cugrid.h"
 #include <cuda_runtime.h>
 #include <stdio.h>
 #define CFC_BLOCK_X 16
 #define CFC_BLOCK_Y 16
 #define CHECK_CUDA(call) do { \
    cudaError_t _e = (call); \
    if (_e != cudaSuccess) { \
        fprintf(stderr, "[cfc/cugrid] %s:%d %s → %s\n", \
                __FILE__, __LINE__, #call, cudaGetErrorString(_e)); \
        return -1; \
    } \
 } while (0)
 /* ── Kernels ──────────────────────────────────────────────────────────── */
 __global__ static void cfc_kern_fill_y(unsigned char *dst, int dst_pitch,
                                         int rx, int ry, int rw, int rh,
                                         int fill, int alpha)
 {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= rw || y >= rh) return;
    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x);
    int cur = *p;
    *p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255);
 }
 __global__ static void cfc_kern_fill_uv(unsigned char *dst, int dst_pitch,
                                          int rx, int ry, int rw, int rh,
                                          int fill_u, int fill_v, int alpha)
 {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= rw || y >= rh) return;
    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2;
    int cu = p[0], cv = p[1];
    p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255);
    p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255);
 }
 __global__ static void cfc_kern_resize_y(
    const unsigned char *src, int src_w, int src_h, int src_pitch,
    unsigned char *dst, int dst_x, int dst_y, int dst_w, int dst_h, int dst_pitch)
 {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dst_w || y >= dst_h) return;
    float fx = ((float)x + 0.5f) * src_w / dst_w - 0.5f;
    float fy = ((float)y + 0.5f) * src_h / dst_h - 0.5f;
    int x0 = max(0, (int)floorf(fx)), x1 = min(src_w - 1, x0 + 1);
    int y0 = max(0, (int)floorf(fy)), y1 = min(src_h - 1, y0 + 1);
    float wx = fx - x0, wy = fy - y0;
    float p00 = (float)src[y0 * src_pitch + x0];
    float p01 = (float)src[y0 * src_pitch + x1];
    float p10 = (float)src[y1 * src_pitch + x0];
    float p11 = (float)src[y1 * src_pitch + x1];
    float v = (1 - wx) * (1 - wy) * p00 + wx * (1 - wy) * p01
            + (1 - wx) * wy * p10 + wx * wy * p11;
    dst[(dst_y + y) * dst_pitch + (dst_x + x)] = (unsigned char)v;
 }
 __global__ static void cfc_kern_resize_uv(
    const unsigned char *src, int src_w, int src_h, int src_pitch,
    unsigned char *dst, int dst_x, int dst_y, int dst_w, int dst_h, int dst_pitch)
 {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dst_w || y >= dst_h) return;
    float fx = ((float)x + 0.5f) * src_w / dst_w - 0.5f;
    float fy = ((float)y + 0.5f) * src_h / dst_h - 0.5f;
    int x0 = max(0, (int)floorf(fx)), x1 = min(src_w - 1, x0 + 1);
    int y0 = max(0, (int)floorf(fy)), y1 = min(src_h - 1, y0 + 1);
    float wx = fx - x0, wy = fy - y0;
    float u00 = (float)src[y0 * src_pitch + x0 * 2 + 0];
    float v00 = (float)src[y0 * src_pitch + x0 * 2 + 1];
    float u01 = (float)src[y0 * src_pitch + x1 * 2 + 0];
    float v01 = (float)src[y0 * src_pitch + x1 * 2 + 1];
    float u10 = (float)src[y1 * src_pitch + x0 * 2 + 0];
    float v10 = (float)src[y1 * src_pitch + x0 * 2 + 1];
    float u11 = (float)src[y1 * src_pitch + x1 * 2 + 0];
    float v11 = (float)src[y1 * src_pitch + x1 * 2 + 1];
    float u = (1 - wx) * (1 - wy) * u00 + wx * (1 - wy) * u01
            + (1 - wx) * wy * u10 + wx * wy * u11;
    float v = (1 - wx) * (1 - wy) * v00 + wx * (1 - wy) * v01
            + (1 - wx) * wy * v10 + wx * wy * v11;
    int idx = (dst_y + y) * dst_pitch + (dst_x + x) * 2;
    dst[idx + 0] = (unsigned char)u;
    dst[idx + 1] = (unsigned char)v;
 }
 /* ── C launcher API ───────────────────────────────────────────────────── */
 extern "C" {
 int cfc_cugrid_init(void)
 {
    return 0;
 }
 int cfc_cugrid_fill_nv12(
    CUstream stream,
    CUdeviceptr dst_y, int pitch_y,
    CUdeviceptr dst_uv, int pitch_uv,
    int x, int y, int w, int h,
    int color_y, int color_u, int color_v, int alpha)
 {
    if (w <= 0 || h <= 0) return 0;
    if (alpha <= 0) return 0;
    if (alpha > 255) alpha = 255;
    dim3 block_y_(CFC_BLOCK_X, CFC_BLOCK_Y);
    dim3 grid_y_((w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
                 (h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
    cfc_kern_fill_y<<<grid_y_, block_y_, 0, (cudaStream_t)stream>>>(
        (unsigned char *)dst_y, pitch_y,
        x, y, w, h, color_y, alpha);
    CHECK_CUDA(cudaGetLastError());
    int uv_x = x / 2, uv_y = y / 2;
    int uv_w = w / 2, uv_h = h / 2;
    dim3 block_uv_(CFC_BLOCK_X, CFC_BLOCK_Y);
    dim3 grid_uv_((uv_w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
                  (uv_h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
    cfc_kern_fill_uv<<<grid_uv_, block_uv_, 0, (cudaStream_t)stream>>>(
        (unsigned char *)dst_uv, pitch_uv,
        uv_x, uv_y, uv_w, uv_h, color_u, color_v, alpha);
    CHECK_CUDA(cudaGetLastError());
    return 0;
 }
 int cfc_cugrid_resize_nv12(
    CUstream stream,
    CUdeviceptr src_y, int src_w, int src_h, int src_pitch_y,
    CUdeviceptr src_uv,                       int src_pitch_uv,
    CUdeviceptr dst_y, int dst_pitch_y,
    CUdeviceptr dst_uv, int dst_pitch_uv,
    int dst_x, int dst_y_off, int dst_w, int dst_h)
 {
    if (dst_w <= 0 || dst_h <= 0) return 0;
    if (src_w <= 0 || src_h <= 0) return -1;
    dim3 block_y_(CFC_BLOCK_X, CFC_BLOCK_Y);
    dim3 grid_y_((dst_w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
                 (dst_h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
    cfc_kern_resize_y<<<grid_y_, block_y_, 0, (cudaStream_t)stream>>>(
        (const unsigned char *)src_y, src_w, src_h, src_pitch_y,
        (unsigned char *)dst_y, dst_x, dst_y_off, dst_w, dst_h, dst_pitch_y);
    CHECK_CUDA(cudaGetLastError());
    int uv_src_w = src_w / 2, uv_src_h = src_h / 2;
    int uv_dst_w = dst_w / 2, uv_dst_h = dst_h / 2;
    int uv_dst_x = dst_x / 2, uv_dst_y = dst_y_off / 2;
    dim3 block_uv_(CFC_BLOCK_X, CFC_BLOCK_Y);
    dim3 grid_uv_((uv_dst_w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
                  (uv_dst_h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
    cfc_kern_resize_uv<<<grid_uv_, block_uv_, 0, (cudaStream_t)stream>>>(
        (const unsigned char *)src_uv, uv_src_w, uv_src_h, src_pitch_uv,
        (unsigned char *)dst_uv, uv_dst_x, uv_dst_y, uv_dst_w, uv_dst_h, dst_pitch_uv);
    CHECK_CUDA(cudaGetLastError());
    return 0;
 }
 }  /* extern "C" */