From 1e2b5d4e16e527a97edf772570b641ef918f63ce Mon Sep 17 00:00:00 2001
From: Evgeny Demchenko <demchenkoev@gmail.com>
Date: Wed, 3 Jun 2026 05:01:49 +0100
Subject: [PATCH] =?UTF-8?q?Phase=202:=20composer=20+=20libcugrid=20(N=20?=
 =?UTF-8?q?=D0=B8=D1=81=D1=82=D0=BE=D1=87=D0=BD=D0=B8=D0=BA=D0=BE=D0=B2=20?=
 =?UTF-8?q?=E2=86=92=202x2=20grid=20=D0=B2=20NV12=20=D0=B1=D1=83=D1=84?=
 =?UTF-8?q?=D0=B5=D1=80)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multi-source композитор работает на 4K @ 25fps стабильно. Live-тест с
4 камерами (parking, back_yard, front_yard, gate_lpr): все 4 active,
350 кадров за 14с, 27.6 МБ H.264 файл, кадр декодируется ffmpeg'ом
с корректным 2x2 layout'ом.

Содержимое:

- include/cuframes_composer/cugrid.h — публичный API libcugrid:
  cfc_cugrid_fill_nv12 (region fill с alpha blend),
  cfc_cugrid_resize_nv12 (bilinear scale в rect).
- src/cugrid/cugrid.cu — извлечённые из vf_cuda_grid kernel'ы
  (Y+UV fill + bilinear resize), объединены с C launcher'ом в одном
  .cu файле, под LGPL-2.1+.
- include/cuframes_composer/composer.h — публичный API композитора:
  cfc_composer_cell_t для layout, get_health для observability.
- src/composer.c — manager N cfc_source_t + единый NV12 output buffer
  (cuMemAlloc, переиспользуется на каждом compose'е). compose_clear
  fillит фон BT.709-чёрным, compose_cell делает resize ACTIVE
  источника или оставляет blackout для DEAD/STALE/CONNECTING.
- examples/grid_record — Phase 2 smoke test: N --cell ключ,x,y,w,h
  → grid composer → NVENC → file.

Сборка: добавлен LANGUAGES CUDA и CMAKE_CUDA_ARCHITECTURES 89;120
(Ada + Blackwell). Compile options раздельные для C и CUDA
(-Wpedantic не подходит для .cu).

Phase 2 RTSP push отложен на отдельный commit — будет через pipe-out
к локальному ffmpeg'у, который публикует в mediamtx (вариант
утверждён в Q2 дизайн-документа).
---
 CMakeLists.txt                       |   9 +-
 examples/CMakeLists.txt              |   5 +
 examples/grid_record.c               | 270 +++++++++++++++++++++++++++
 include/cuframes_composer/composer.h |  97 ++++++++++
 include/cuframes_composer/cugrid.h   |  81 ++++++++
 src/CMakeLists.txt                   |  36 ++--
 src/composer.c                       | 270 +++++++++++++++++++++++++++
 src/cugrid/cugrid.cu                 | 193 +++++++++++++++++++
 8 files changed, 949 insertions(+), 12 deletions(-)
 create mode 100644 examples/grid_record.c
 create mode 100644 include/cuframes_composer/composer.h
 create mode 100644 include/cuframes_composer/cugrid.h
 create mode 100644 src/composer.c
 create mode 100644 src/cugrid/cugrid.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f1ae01c..5159193 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,13 +2,20 @@ cmake_minimum_required(VERSION 3.20)
 project(cuframes-composer
     VERSION 0.1.0
     DESCRIPTION "Multi-source video grid composer на CUDA + NVENC + RTSP"
-    LANGUAGES C
+    LANGUAGES C CUDA
 )
 
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+# CUDA архитектуры. RTX 5090 = sm_120 (Blackwell consumer), но также 5090 не
+# поддерживает legacy sm_75, поэтому 89 (Ada Lovelace, RTX 4090) для совместимости
+# с тестовым окружением + 120 для прода.
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES "89;120")
+endif()
+
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 9f40164..8591b90 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -10,3 +10,8 @@
 add_executable(simple_record simple_record.c)
 target_link_libraries(simple_record PRIVATE cuframes_composer_static)
 target_include_directories(simple_record PRIVATE ${CMAKE_SOURCE_DIR}/include)
+
+# Phase 2 — multi-source grid в файл.
+add_executable(grid_record grid_record.c)
+target_link_libraries(grid_record PRIVATE cuframes_composer_static)
+target_include_directories(grid_record PRIVATE ${CMAKE_SOURCE_DIR}/include)
diff --git a/examples/grid_record.c b/examples/grid_record.c
new file mode 100644
index 0000000..d8e53d7
--- /dev/null
+++ b/examples/grid_record.c
@@ -0,0 +1,270 @@
+/* grid_record — Phase 2 smoke test.
+ *
+ * Подписывается на 4 cuframes-источника (4 камеры), композирует их в 2×2 grid
+ * через cfc_composer, кодирует через NVENC, пишет H.264 в файл.
+ *
+ * Layout 2×2 1080p:
+ *   Output: 3840×2160 (4K)
+ *   Cells:  4 шт. 1920×1080 в углах
+ *
+ * Использование:
+ *   grid_record --out 4k.h264 \
+ *               --cell cam-parking,0,0,1920,1080 \
+ *               --cell cam-back_yard,1920,0,1920,1080 \
+ *               --cell cam-front_yard,0,1080,1920,1080 \
+ *               --cell cam-gate_lpr,1920,1080,1920,1080 \
+ *               --seconds 15
+ *
+ * Лицензия: LGPL-2.1+
+ */
+
+#include "../include/cuframes_composer/composer.h"
+#include "../include/cuframes_composer/nvenc.h"
+
+#include <cuda.h>
+
+#include <errno.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#define MAX_CELLS 16
+
+static volatile sig_atomic_t g_stop = 0;
+
+static void on_sig(int s) { (void)s; g_stop = 1; }
+
+typedef struct write_ctx {
+    FILE *fp;
+    uint64_t bytes_written;
+    uint64_t frames_encoded;
+    uint64_t idr_count;
+} write_ctx_t;
+
+static void on_bitstream(const uint8_t *bs, size_t size, int64_t pts_ns,
+                          int is_idr, void *user)
+{
+    (void)pts_ns;
+    write_ctx_t *ctx = (write_ctx_t *)user;
+    if (fwrite(bs, 1, size, ctx->fp) == size) {
+        ctx->bytes_written += size;
+        ctx->frames_encoded++;
+        if (is_idr) ctx->idr_count++;
+    }
+}
+
+static int parse_cell(const char *arg, cfc_composer_cell_t *out,
+                       char *key_storage)
+{
+    /* Формат: key,x,y,w,h */
+    char buf[128];
+    strncpy(buf, arg, sizeof(buf) - 1);
+    buf[sizeof(buf) - 1] = '\0';
+    char *tok = strtok(buf, ",");
+    if (!tok) return -1;
+    strncpy(key_storage, tok, 63);
+    key_storage[63] = '\0';
+    out->source_key = key_storage;
+    tok = strtok(NULL, ","); if (!tok) return -1; out->x = atoi(tok);
+    tok = strtok(NULL, ","); if (!tok) return -1; out->y = atoi(tok);
+    tok = strtok(NULL, ","); if (!tok) return -1; out->w = atoi(tok);
+    tok = strtok(NULL, ","); if (!tok) return -1; out->h = atoi(tok);
+    return 0;
+}
+
+static const char *cu_err(CUresult r)
+{
+    const char *s = NULL;
+    cuGetErrorString(r, &s);
+    return s ? s : "?";
+}
+
+int main(int argc, char **argv)
+{
+    const char *out_path = NULL;
+    int fps = 25, bitrate = 10000, max_seconds = 0;
+    int out_w = 3840, out_h = 2160;
+    cfc_composer_cell_t cells[MAX_CELLS] = { 0 };
+    static char cell_keys[MAX_CELLS][64];
+    int num_cells = 0;
+
+    static struct option opts[] = {
+        {"out",     required_argument, 0, 'o'},
+        {"cell",    required_argument, 0, 'c'},
+        {"fps",     required_argument, 0, 'f'},
+        {"bitrate", required_argument, 0, 'b'},
+        {"width",   required_argument, 0, 'W'},
+        {"height",  required_argument, 0, 'H'},
+        {"seconds", required_argument, 0, 's'},
+        {0, 0, 0, 0},
+    };
+    int c;
+    while ((c = getopt_long(argc, argv, "o:c:f:b:W:H:s:", opts, NULL)) != -1) {
+        switch (c) {
+        case 'o': out_path = optarg; break;
+        case 'c':
+            if (num_cells >= MAX_CELLS) {
+                fprintf(stderr, "max %d cells\n", MAX_CELLS);
+                return 1;
+            }
+            if (parse_cell(optarg, &cells[num_cells], cell_keys[num_cells]) != 0) {
+                fprintf(stderr, "invalid --cell '%s' (key,x,y,w,h)\n", optarg);
+                return 1;
+            }
+            num_cells++;
+            break;
+        case 'f': fps = atoi(optarg); break;
+        case 'b': bitrate = atoi(optarg); break;
+        case 'W': out_w = atoi(optarg); break;
+        case 'H': out_h = atoi(optarg); break;
+        case 's': max_seconds = atoi(optarg); break;
+        default: return 1;
+        }
+    }
+    if (!out_path || num_cells == 0) {
+        fprintf(stderr,
+            "Использование: %s --out <file.h264> --cell key,x,y,w,h [--cell ...]\n"
+            "                [--width 3840] [--height 2160] [--fps 25]\n"
+            "                [--bitrate 10000] [--seconds N]\n",
+            argv[0]);
+        return 1;
+    }
+
+    signal(SIGINT, on_sig);
+    signal(SIGTERM, on_sig);
+
+    /* CUDA primary context. */
+    CUresult cr = cuInit(0);
+    if (cr != CUDA_SUCCESS) { fprintf(stderr, "cuInit: %s\n", cu_err(cr)); return 1; }
+    CUdevice dev;
+    cuDeviceGet(&dev, 0);
+    CUcontext ctx;
+    cuDevicePrimaryCtxRetain(&ctx, dev);
+    cuCtxPushCurrent(ctx);
+
+    /* Composer. */
+    cfc_composer_config_t ccfg = {
+        .width = out_w,
+        .height = out_h,
+        .cells = cells,
+        .num_cells = num_cells,
+        .cuda_device = 0,
+    };
+    cfc_composer_t *comp = NULL;
+    if (cfc_composer_create(&ccfg, &comp) != 0) {
+        fprintf(stderr, "cfc_composer_create failed\n");
+        return 1;
+    }
+    fprintf(stderr, "[grid_record] composer %dx%d, %d ячеек\n",
+            out_w, out_h, num_cells);
+
+    /* Encoder. */
+    cfc_encoder_config_t ecfg = {
+        .cuda_ctx = ctx,
+        .width = out_w,
+        .height = out_h,
+        .fps_num = fps,
+        .fps_den = 1,
+        .bitrate_kbps = bitrate,
+        .gop_size = fps,
+        .num_b_frames = 0,
+        .preset = "ll",
+    };
+    cfc_encoder_t *enc = NULL;
+    if (cfc_encoder_create(&ecfg, &enc) != 0) {
+        fprintf(stderr, "cfc_encoder_create failed\n");
+        cfc_composer_destroy(comp);
+        return 1;
+    }
+
+    /* Output file. */
+    write_ctx_t wctx = { 0 };
+    wctx.fp = fopen(out_path, "wb");
+    if (!wctx.fp) {
+        fprintf(stderr, "fopen(%s): %s\n", out_path, strerror(errno));
+        cfc_encoder_destroy(enc);
+        cfc_composer_destroy(comp);
+        return 1;
+    }
+    fprintf(stderr, "[grid_record] начало записи в %s (Ctrl+C для остановки)\n",
+            out_path);
+
+    /* Main loop — frame cadence по wall clock'у. */
+    struct timespec ts_start;
+    clock_gettime(CLOCK_MONOTONIC, &ts_start);
+    int64_t start_us = (int64_t)ts_start.tv_sec * 1000000 + ts_start.tv_nsec / 1000;
+    int64_t frame_us = 1000000 / fps;
+    int64_t next_us = start_us;
+
+    while (!g_stop) {
+        struct timespec now;
+        clock_gettime(CLOCK_MONOTONIC, &now);
+        int64_t now_us = (int64_t)now.tv_sec * 1000000 + now.tv_nsec / 1000;
+        if (now_us < next_us) {
+            int64_t sleep_us = next_us - now_us;
+            if (sleep_us > 1000000) sleep_us = 1000000;
+            struct timespec ts = {
+                .tv_sec = sleep_us / 1000000,
+                .tv_nsec = (sleep_us % 1000000) * 1000,
+            };
+            nanosleep(&ts, NULL);
+            continue;
+        }
+        next_us += frame_us;
+
+        CUdeviceptr out_y = 0;
+        int out_pitch = 0, oW = 0, oH = 0;
+        if (cfc_composer_compose(comp, &out_y, &out_pitch, &oW, &oH) != 0) {
+            fprintf(stderr, "[grid_record] compose failed\n");
+            break;
+        }
+
+        int64_t pts_ns = (now_us - start_us) * 1000;
+        if (cfc_encoder_encode_frame(enc, out_y, out_pitch, pts_ns,
+                                      on_bitstream, &wctx) != 0) {
+            fprintf(stderr, "[grid_record] encode failed\n");
+            break;
+        }
+
+        if (wctx.frames_encoded > 0 && wctx.frames_encoded % 50 == 0) {
+            double elapsed = (now_us - start_us) / 1e6;
+            cfc_composer_health_t h;
+            cfc_composer_get_health(comp, &h);
+            fprintf(stderr,
+                "[grid_record] %llu кадров, %llu IDR, %.1f МБ за %.1fс (%.1f fps) | "
+                "src active=%d stale=%d dead=%d\n",
+                (unsigned long long)wctx.frames_encoded,
+                (unsigned long long)wctx.idr_count,
+                wctx.bytes_written / 1048576.0,
+                elapsed,
+                wctx.frames_encoded / elapsed,
+                h.active, h.stale, h.dead);
+        }
+
+        if (max_seconds > 0 && (now_us - start_us) / 1000000 >= max_seconds) {
+            fprintf(stderr, "[grid_record] лимит %dс\n", max_seconds);
+            break;
+        }
+    }
+
+    fprintf(stderr, "[grid_record] flush encoder\n");
+    cfc_encoder_flush(enc, on_bitstream, &wctx);
+
+    fprintf(stderr,
+        "[grid_record] итого: %llu кадров, %llu IDR, %.2f МБ\n",
+        (unsigned long long)wctx.frames_encoded,
+        (unsigned long long)wctx.idr_count,
+        wctx.bytes_written / 1048576.0);
+
+    fclose(wctx.fp);
+    cfc_encoder_destroy(enc);
+    cfc_composer_destroy(comp);
+    cuCtxPopCurrent(NULL);
+    cuDevicePrimaryCtxRelease(dev);
+    return 0;
+}
diff --git a/include/cuframes_composer/composer.h b/include/cuframes_composer/composer.h
new file mode 100644
index 0000000..12d350a
--- /dev/null
+++ b/include/cuframes_composer/composer.h
@@ -0,0 +1,97 @@
+/* cuframes-composer — высокоуровневый композитор N источников в grid.
+ *
+ * Управляет:
+ *   - N cfc_source (фоновые подписки на cuframes-publisher'ы)
+ *   - Layout (cell positions + size)
+ *   - Output NV12 buffer (cuMemAlloc, переиспользуется)
+ *   - Композиция на каждом тике: clear background + resize/blit для каждого ACTIVE,
+ *     fill чёрным для DEAD/STALE.
+ *
+ * Phase 2 архитектура:
+ *   Композитор работает синхронно: caller вызывает cfc_composer_compose() →
+ *   композитор для каждого источника берёт snapshot и пишет в output. После
+ *   вызова output NV12 buffer готов к encode.
+ *
+ * Каноническая интеграция:
+ *   for (;;) {
+ *       cfc_composer_compose(comp);          // grid готов в output buffer
+ *       cfc_encoder_encode_frame(enc, ...);  // → H.264
+ *   }
+ *
+ * Лицензия: LGPL-2.1+
+ */
+
+#ifndef CUFRAMES_COMPOSER_COMPOSER_H
+#define CUFRAMES_COMPOSER_COMPOSER_H
+
+#include "source.h"
+
+#include <cuda.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Описание одной ячейки grid'а — где на output буфере рисуется источник.
+ * Все координаты в full-res пикселях, должны быть чётными. */
+typedef struct cfc_composer_cell {
+    const char *source_key;           /* cuframes key, например "cam-parking" */
+    int x, y;                         /* top-left угол на output буфере */
+    int w, h;                         /* размер ячейки */
+} cfc_composer_cell_t;
+
+typedef struct cfc_composer_config {
+    /* Output буфер (одно фиксированное разрешение для всего grid'а). */
+    int width;                        /* output ширина (например 3840 для 2×2 1080p) */
+    int height;                       /* output высота (например 2160 для 2×2 1080p) */
+
+    /* Cells конфигурация. Массив указателей на cells (для удобства user'а). */
+    const cfc_composer_cell_t *cells; /* array, не копируется — caller держит */
+    int num_cells;
+
+    /* CUDA устройство. */
+    int cuda_device;                  /* индекс, обычно 0 */
+
+    /* Backgound цвет (BT.709 limited): Y=16/U=128/V=128 = чёрный. */
+    int bg_y, bg_u, bg_v;
+
+    /* Параметры stale/dead для источников. */
+    int reconnect_min_ms;             /* default 1000 */
+    int reconnect_max_ms;             /* default 30000 */
+    int stale_threshold_ms;           /* default 500 */
+    int dead_threshold_ms;            /* default 5000 */
+} cfc_composer_config_t;
+
+typedef struct cfc_composer cfc_composer_t;
+
+/* Создать композитор. Выделяет output NV12 буфер, запускает N source thread'ов. */
+int cfc_composer_create(const cfc_composer_config_t *cfg, cfc_composer_t **out);
+
+/* Скомпоновать один кадр. Вернёт указатели на output NV12 (Y + UV) и pitch.
+ * Указатели действительны до следующего compose. */
+int cfc_composer_compose(
+    cfc_composer_t *comp,
+    CUdeviceptr *out_y_ptr,
+    int *out_pitch_y,
+    int *out_width,
+    int *out_height
+);
+
+/* Получить layout статистику по источникам — для debug / health-репортов. */
+typedef struct cfc_composer_health {
+    int total;                        /* всего источников */
+    int active;                       /* в состоянии ACTIVE */
+    int stale;                        /* в STALE */
+    int dead;                         /* DEAD/DISCONNECTED/CONNECTING */
+} cfc_composer_health_t;
+int cfc_composer_get_health(cfc_composer_t *comp, cfc_composer_health_t *out);
+
+/* Уничтожить композитор. */
+int cfc_composer_destroy(cfc_composer_t *comp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CUFRAMES_COMPOSER_COMPOSER_H */
diff --git a/include/cuframes_composer/cugrid.h b/include/cuframes_composer/cugrid.h
new file mode 100644
index 0000000..25eb44c
--- /dev/null
+++ b/include/cuframes_composer/cugrid.h
@@ -0,0 +1,81 @@
+/* cuframes-composer — CUDA kernels для grid-композиции NV12 кадров.
+ *
+ * Извлечено из vf_cuda_grid (FFmpeg out-of-tree filter), под LGPL-2.1+.
+ * Все операции работают на NV12-кадрах:
+ *   - Y plane: full resolution, 1 byte per pixel
+ *   - UV plane: half resolution, 2 bytes per pixel (interleaved U,V)
+ *
+ * Все операции:
+ *   - Принимают CUstream (NULL = default stream)
+ *   - Не аллоцируют память (caller выделяет dst буфер заранее)
+ *   - Idempotent для re-use (composer вызывает на каждом кадре)
+ *
+ * Lifecycle:
+ *   cfc_cugrid_init() один раз при старте композитора (no-op для CUDA, но
+ *                     резервирован под cuModuleLoad если перейдём на PTX)
+ *
+ * Лицензия: LGPL-2.1+
+ */
+
+#ifndef CUFRAMES_COMPOSER_CUGRID_H
+#define CUFRAMES_COMPOSER_CUGRID_H
+
+#include <cuda.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Lazy init (no-op в текущей реализации; зарезервировано под future PTX module). */
+int cfc_cugrid_init(void);
+
+/* Заполнить прямоугольник на NV12 кадре solid color'ом с alpha-блендингом.
+ *
+ *   dst_y, dst_uv     — указатели на Y и UV plane соответственно.
+ *   pitch_y, pitch_uv — pitch в bytes (для NV12 обычно равны и кратны 256).
+ *   x, y, w, h        — прямоугольник в full-res пикселях (chroma usually пишется
+ *                       автоматически на половинном rastr'е).
+ *                       w и h должны быть чётными (требование 4:2:0 subsampling'а).
+ *   color_y/u/v       — компоненты цвета в BT.709 limited range (Y: 16-235, UV: 16-240).
+ *                       Для чёрного: Y=16, U=128, V=128.
+ *   alpha             — 0..255 (0 = transparent, 255 = opaque).
+ *
+ * Возвращает 0 при успехе.
+ */
+int cfc_cugrid_fill_nv12(
+    CUstream stream,
+    CUdeviceptr dst_y, int pitch_y,
+    CUdeviceptr dst_uv, int pitch_uv,
+    int x, int y, int w, int h,
+    int color_y, int color_u, int color_v, int alpha
+);
+
+/* Resize NV12 src → rect (dst_x, dst_y, dst_w, dst_h) на NV12 dst.
+ *
+ * Bilinear interpolation на Y и UV. Y и UV plane src'а должны быть одним
+ * allocation'ом (NV12 layout) или хотя бы иметь корректные указатели каждый.
+ *
+ *   src_y, src_uv      — src указатели.
+ *   src_w, src_h       — размер src в full-res пикселях.
+ *   src_pitch_y/uv     — pitch'ы (для NV12 cuframes pitch_y == pitch_uv).
+ *   dst_y, dst_uv      — dst указатели.
+ *   dst_pitch_y/uv     — pitch'ы dst NV12 кадра.
+ *   dst_x, dst_y_off,  — координата top-left прямоугольника на dst (full-res пиксели).
+ *   dst_w, dst_h         dst_x и dst_y_off, dst_w, dst_h ДОЛЖНЫ быть чётными
+ *                       (для UV/chroma subsampling'а).
+ */
+int cfc_cugrid_resize_nv12(
+    CUstream stream,
+    CUdeviceptr src_y, int src_w, int src_h, int src_pitch_y,
+    CUdeviceptr src_uv,                       int src_pitch_uv,
+    CUdeviceptr dst_y, int dst_pitch_y,
+    CUdeviceptr dst_uv, int dst_pitch_uv,
+    int dst_x, int dst_y_off, int dst_w, int dst_h
+);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CUFRAMES_COMPOSER_CUGRID_H */
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 54f0a15..17867e1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,22 +1,27 @@
 # Основная библиотека композитора — `libcuframes_composer.so`. Содержит:
-#   - source.c       подписка к cuframes публишеру + state machine
-#   - nvenc_loader.c dlopen libnvidia-encode.so + загрузка API таблицы
-#   - nvenc.c        обвязка вокруг NVENC SDK (init/encode/teardown)
+#   - source.c          подписка к cuframes публишеру + state machine
+#   - nvenc_loader.c    dlopen libnvidia-encode.so + загрузка API таблицы
+#   - nvenc.c           обвязка вокруг NVENC SDK (init/encode/teardown)
+#   - composer.c        Phase 2: multi-source grid композитор
+#   - cugrid/cugrid.cu  Phase 2: CUDA kernels (resize, fill) для NV12 grid'а
 #
 # Дальше по фазам:
-#   Phase 2: compose.c (CUDA composition), ringbuf.c (SPSC swap)
-#   Phase 3: rtsp_publisher.c, rtp_h264.c, overlay.c, png_decode.c, text_render.c
+#   Phase 3: overlay.c, png_decode.c, text_render.c, rtsp_publisher.c, rtp_h264.c
 #   Phase 4: control_zmq.c
 #   Phase 5: health_mqtt.c
 
-set(COMPOSER_SOURCES
+set(COMPOSER_SOURCES_C
     source.c
     nvenc_loader.c
     nvenc.c
+    composer.c
+)
+set(COMPOSER_SOURCES_CU
+    cugrid/cugrid.cu
 )
 
-add_library(cuframes_composer SHARED ${COMPOSER_SOURCES})
-add_library(cuframes_composer_static STATIC ${COMPOSER_SOURCES})
+add_library(cuframes_composer SHARED ${COMPOSER_SOURCES_C} ${COMPOSER_SOURCES_CU})
+add_library(cuframes_composer_static STATIC ${COMPOSER_SOURCES_C} ${COMPOSER_SOURCES_CU})
 
 foreach(target cuframes_composer cuframes_composer_static)
     target_include_directories(${target}
@@ -28,10 +33,15 @@ foreach(target cuframes_composer cuframes_composer_static)
             ${NVCODEC_HEADERS_DIR}
     )
     target_compile_features(${target} PRIVATE c_std_11)
+    # C-only флаги (для CUDA свои дефолты, -Wpedantic не подходит для .cu).
     target_compile_options(${target} PRIVATE
-        -Wall -Wextra -Wpedantic
-        $<$<CONFIG:Debug>:-O0 -g3>
-        $<$<CONFIG:Release>:-O2 -g>
+        $<$<COMPILE_LANGUAGE:C>:-Wall>
+        $<$<COMPILE_LANGUAGE:C>:-Wextra>
+        $<$<COMPILE_LANGUAGE:C>:-Wpedantic>
+        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Debug>>:-O0>
+        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Debug>>:-g3>
+        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Release>>:-O2>
+        $<$<AND:$<COMPILE_LANGUAGE:C>,$<CONFIG:Release>>:-g>
     )
     target_link_libraries(${target}
         PUBLIC
@@ -42,6 +52,10 @@ foreach(target cuframes_composer cuframes_composer_static)
             ${LIBDL_LIBRARY}           # для dlopen libnvidia-encode.so
             rt
     )
+    # CUDA properties.
+    set_target_properties(${target} PROPERTIES
+        CUDA_SEPARABLE_COMPILATION OFF
+    )
 endforeach()
 
 set_target_properties(cuframes_composer PROPERTIES
diff --git a/src/composer.c b/src/composer.c
new file mode 100644
index 0000000..3b999d0
--- /dev/null
+++ b/src/composer.c
@@ -0,0 +1,270 @@
+/* Реализация cfc_composer_t — multi-source grid композитор.
+ *
+ * Owns:
+ *   - N cfc_source_t (по одному на ячейку grid'а)
+ *   - один NV12 output buffer (cuMemAlloc — staging для NVENC encoder'а)
+ *   - statistics для health-репортов
+ *
+ * Compose-цикл:
+ *   1) cuMemsetD8 → быстрое черный fill всего Y plane (16=BT.709 black)
+ *      + UV plane заполняется отдельно (128,128).
+ *   2) Для каждой ячейки:
+ *      a) get_latest snapshot.
+ *      b) ACTIVE → cfc_cugrid_resize_nv12 (src VMM → dst rect)
+ *      c) DEAD/STALE → cfc_cugrid_fill_nv12 чёрным с alpha=255 уже сделано,
+ *         тут лучше визуально показать что источник упал, поэтому в Phase 3
+ *         поверх blackout рисуется текст «NO SIGNAL» через overlay'и.
+ *   3) cudaStreamSynchronize → output готов.
+ *
+ * Phase 2 упрощения:
+ *   - Sync compose на default stream. Stream pipelining — Phase 3+.
+ *   - Без double buffering. encode и compose делаются строго последовательно.
+ *
+ * Лицензия: LGPL-2.1+
+ */
+
+#include "../include/cuframes_composer/composer.h"
+#include "../include/cuframes_composer/cugrid.h"
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define CFC_COMPOSER_MAX_CELLS 64
+
+struct cfc_composer {
+    cfc_composer_config_t cfg;
+
+    /* Копии cells (caller владеет original config'ом). source_key копируется
+     * в персистентную строку чтобы cfc_source_t могла на неё указывать. */
+    cfc_composer_cell_t cells[CFC_COMPOSER_MAX_CELLS];
+    char cell_keys[CFC_COMPOSER_MAX_CELLS][64];
+    int num_cells;
+
+    /* Источники по индексу cell'а (или NULL если cell — статичная картинка). */
+    cfc_source_t *sources[CFC_COMPOSER_MAX_CELLS];
+
+    /* Output NV12 буфер: один contiguous allocation, Y plane (pitch * h) +
+     * UV plane (pitch * h/2). Pitch выравнен на 256 байт. */
+    CUdeviceptr output_ptr;
+    int output_pitch_y;
+    int output_pitch_uv;
+    size_t output_size;
+
+    /* CUDA stream для compose (Phase 2 — default stream = 0). */
+    cudaStream_t stream;
+};
+
+/* ── Helpers ──────────────────────────────────────────────────────────── */
+
+static int round_up_pitch(int w)
+{
+    return (w + 255) & ~255;
+}
+
+static void *cu_ptr(CUdeviceptr p) { return (void *)(uintptr_t)p; }
+
+/* ── Compose ──────────────────────────────────────────────────────────── */
+
+static int compose_clear(cfc_composer_t *comp)
+{
+    /* Y plane → 16 (BT.709 black). */
+    cudaError_t e = cudaMemsetAsync(
+        cu_ptr(comp->output_ptr), comp->cfg.bg_y,
+        (size_t)comp->output_pitch_y * comp->cfg.height,
+        comp->stream);
+    if (e != cudaSuccess) {
+        fprintf(stderr, "[cfc/composer] Y memset failed: %s\n", cudaGetErrorString(e));
+        return -1;
+    }
+
+    /* UV plane → нужны два значения (U=128, V=128), не один. Делаем fill
+     * через тот же cfc_cugrid_fill_nv12 которым fillим ячейки. Прокидываем
+     * alpha=255 чтобы перезатереть полностью. */
+    CUdeviceptr uv = comp->output_ptr +
+                     (size_t)comp->output_pitch_y * comp->cfg.height;
+
+    /* Просто memset UV не подходит — там interleaved пары. Делаем fill_nv12
+     * с alpha=255, тогда формула станет dst = fill * 255 / 255 = fill. */
+    int rc = cfc_cugrid_fill_nv12(
+        (CUstream)comp->stream,
+        /* Y уже сделан выше — но fill_nv12 повторно fillит Y. Передаём
+         * y_color = bg_y, alpha=255 — get тот же результат, минор waste.
+         * Phase 2 acceptable, в Phase 3 разделим Y/UV fillы. */
+        comp->output_ptr, comp->output_pitch_y,
+        uv, comp->output_pitch_uv,
+        0, 0, comp->cfg.width, comp->cfg.height,
+        comp->cfg.bg_y, comp->cfg.bg_u, comp->cfg.bg_v, 255);
+    return rc;
+}
+
+static int compose_cell(cfc_composer_t *comp, int idx)
+{
+    const cfc_composer_cell_t *cell = &comp->cells[idx];
+    cfc_source_t *src = comp->sources[idx];
+    if (!src) return 0;
+
+    cfc_source_snapshot_t snap;
+    cfc_source_get_latest(src, &snap);
+
+    if (snap.state != CFC_SOURCE_ACTIVE || snap.width <= 0) {
+        /* DEAD/STALE/CONNECTING — оставляем чёрный (уже clear'нут).
+         * Phase 3 добавит overlay «NO SIGNAL». */
+        return 0;
+    }
+
+    CUdeviceptr uv = comp->output_ptr +
+                     (size_t)comp->output_pitch_y * comp->cfg.height;
+    /* Source NV12 layout: Y (pitch_y * height) + UV (pitch_y * height/2)
+     * непрерывно. Указатель на UV plane = ptr + pitch_y * height. */
+    CUdeviceptr src_uv = snap.ptr + (size_t)snap.pitch_y * snap.height;
+
+    return cfc_cugrid_resize_nv12(
+        (CUstream)comp->stream,
+        snap.ptr, snap.width, snap.height, snap.pitch_y,
+        src_uv, snap.pitch_uv,
+        comp->output_ptr, comp->output_pitch_y,
+        uv, comp->output_pitch_uv,
+        cell->x, cell->y, cell->w, cell->h);
+}
+
+/* ── Public API ───────────────────────────────────────────────────────── */
+
+int cfc_composer_create(const cfc_composer_config_t *cfg, cfc_composer_t **out)
+{
+    if (!cfg || !out) return -1;
+    if (cfg->width <= 0 || cfg->height <= 0) return -1;
+    if (cfg->num_cells <= 0 || cfg->num_cells > CFC_COMPOSER_MAX_CELLS) return -1;
+    if (!cfg->cells) return -1;
+
+    cfc_composer_t *comp = calloc(1, sizeof(*comp));
+    if (!comp) return -1;
+    comp->cfg = *cfg;
+    comp->num_cells = cfg->num_cells;
+    comp->stream = 0;  /* default stream Phase 2 */
+
+    /* Дефолты для bg цвета (если caller не задал). */
+    if (!comp->cfg.bg_y) comp->cfg.bg_y = 16;
+    if (!comp->cfg.bg_u) comp->cfg.bg_u = 128;
+    if (!comp->cfg.bg_v) comp->cfg.bg_v = 128;
+
+    /* Сохраняем cells + копируем source_key в персистентное хранилище. */
+    for (int i = 0; i < cfg->num_cells; i++) {
+        comp->cells[i] = cfg->cells[i];
+        if (cfg->cells[i].source_key) {
+            strncpy(comp->cell_keys[i], cfg->cells[i].source_key,
+                    sizeof(comp->cell_keys[i]) - 1);
+            comp->cells[i].source_key = comp->cell_keys[i];
+        }
+    }
+
+    /* Выделяем output NV12 буфер. */
+    comp->output_pitch_y = round_up_pitch(cfg->width);
+    comp->output_pitch_uv = comp->output_pitch_y;
+    comp->output_size = (size_t)comp->output_pitch_y * cfg->height +
+                        (size_t)comp->output_pitch_uv * (cfg->height / 2);
+
+    CUresult cr = cuMemAlloc(&comp->output_ptr, comp->output_size);
+    if (cr != CUDA_SUCCESS) {
+        const char *es = NULL; cuGetErrorString(cr, &es);
+        fprintf(stderr, "[cfc/composer] cuMemAlloc(%zu) failed: %s\n",
+                comp->output_size, es ? es : "?");
+        free(comp);
+        return -1;
+    }
+
+    /* Создаём источники по одному на cell с source_key. */
+    for (int i = 0; i < comp->num_cells; i++) {
+        if (!comp->cells[i].source_key) continue;
+
+        char name[32];
+        snprintf(name, sizeof(name), "composer-%d", i);
+
+        cfc_source_config_t scfg = {
+            .key = comp->cells[i].source_key,
+            .consumer_name = name,
+            .cuda_device = cfg->cuda_device,
+            .reconnect_min_ms = cfg->reconnect_min_ms,
+            .reconnect_max_ms = cfg->reconnect_max_ms,
+            .stale_threshold_ms = cfg->stale_threshold_ms,
+            .dead_threshold_ms = cfg->dead_threshold_ms,
+        };
+        if (cfc_source_create(&scfg, &comp->sources[i]) != 0) {
+            fprintf(stderr,
+                    "[cfc/composer] cfc_source_create failed для cell %d (%s)\n",
+                    i, comp->cells[i].source_key);
+            /* Не fatal — DEAD source просто будет показывать blackout.
+             * Ячейка остаётся NULL → compose_cell сразу выйдет. */
+        }
+    }
+
+    cfc_cugrid_init();
+
+    *out = comp;
+    return 0;
+}
+
+int cfc_composer_compose(cfc_composer_t *comp,
+                          CUdeviceptr *out_y_ptr,
+                          int *out_pitch_y,
+                          int *out_width,
+                          int *out_height)
+{
+    if (!comp) return -1;
+
+    if (compose_clear(comp) != 0) return -1;
+
+    for (int i = 0; i < comp->num_cells; i++) {
+        if (compose_cell(comp, i) != 0) {
+            fprintf(stderr, "[cfc/composer] compose_cell %d failed\n", i);
+            /* Не fatal — продолжаем с остальными ячейками. */
+        }
+    }
+
+    cudaError_t e = cudaStreamSynchronize(comp->stream);
+    if (e != cudaSuccess) {
+        fprintf(stderr, "[cfc/composer] stream sync failed: %s\n",
+                cudaGetErrorString(e));
+        return -1;
+    }
+
+    if (out_y_ptr) *out_y_ptr = comp->output_ptr;
+    if (out_pitch_y) *out_pitch_y = comp->output_pitch_y;
+    if (out_width) *out_width = comp->cfg.width;
+    if (out_height) *out_height = comp->cfg.height;
+    return 0;
+}
+
+int cfc_composer_get_health(cfc_composer_t *comp, cfc_composer_health_t *out)
+{
+    if (!comp || !out) return -1;
+    memset(out, 0, sizeof(*out));
+    out->total = comp->num_cells;
+    for (int i = 0; i < comp->num_cells; i++) {
+        if (!comp->sources[i]) {
+            out->dead++;
+            continue;
+        }
+        cfc_source_snapshot_t snap;
+        cfc_source_get_latest(comp->sources[i], &snap);
+        switch (snap.state) {
+        case CFC_SOURCE_ACTIVE: out->active++; break;
+        case CFC_SOURCE_STALE:  out->stale++; break;
+        default:                out->dead++; break;
+        }
+    }
+    return 0;
+}
+
+int cfc_composer_destroy(cfc_composer_t *comp)
+{
+    if (!comp) return 0;
+    for (int i = 0; i < comp->num_cells; i++) {
+        if (comp->sources[i]) cfc_source_destroy(comp->sources[i]);
+    }
+    if (comp->output_ptr) cuMemFree(comp->output_ptr);
+    free(comp);
+    return 0;
+}
diff --git a/src/cugrid/cugrid.cu b/src/cugrid/cugrid.cu
new file mode 100644
index 0000000..30a64a9
--- /dev/null
+++ b/src/cugrid/cugrid.cu
@@ -0,0 +1,193 @@
+/* CUDA kernels + C launcher для grid-композиции NV12 кадров.
+ *
+ * Извлечено из vf_cuda_grid (FFmpeg out-of-tree filter), LGPL-2.1+.
+ *
+ * Все операции работают на NV12 кадрах:
+ *   Y plane:  1 byte/pixel
+ *   UV plane: 2 bytes/UV-pair (interleaved U,V), pitch обычно == pitch_y
+ *
+ * Стандартный block size: 16×16 = 256 threads, кратно warp 32.
+ *
+ * Lifecycle:
+ *   cfc_cugrid_init() один раз при старте композитора (no-op для CUDA C++
+ *   runtime kernels, зарезервировано на cuModuleLoad если перейдём на PTX)
+ *
+ * Лицензия: LGPL-2.1+
+ */
+
+#include "../../include/cuframes_composer/cugrid.h"
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#define CFC_BLOCK_X 16
+#define CFC_BLOCK_Y 16
+
+#define CHECK_CUDA(call) do { \
+    cudaError_t _e = (call); \
+    if (_e != cudaSuccess) { \
+        fprintf(stderr, "[cfc/cugrid] %s:%d %s → %s\n", \
+                __FILE__, __LINE__, #call, cudaGetErrorString(_e)); \
+        return -1; \
+    } \
+} while (0)
+
+/* ── Kernels ──────────────────────────────────────────────────────────── */
+
+__global__ static void cfc_kern_fill_y(unsigned char *dst, int dst_pitch,
+                                         int rx, int ry, int rw, int rh,
+                                         int fill, int alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= rw || y >= rh) return;
+    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x);
+    int cur = *p;
+    *p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255);
+}
+
+__global__ static void cfc_kern_fill_uv(unsigned char *dst, int dst_pitch,
+                                          int rx, int ry, int rw, int rh,
+                                          int fill_u, int fill_v, int alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= rw || y >= rh) return;
+    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2;
+    int cu = p[0], cv = p[1];
+    p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255);
+    p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255);
+}
+
+__global__ static void cfc_kern_resize_y(
+    const unsigned char *src, int src_w, int src_h, int src_pitch,
+    unsigned char *dst, int dst_x, int dst_y, int dst_w, int dst_h, int dst_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dst_w || y >= dst_h) return;
+
+    float fx = ((float)x + 0.5f) * src_w / dst_w - 0.5f;
+    float fy = ((float)y + 0.5f) * src_h / dst_h - 0.5f;
+    int x0 = max(0, (int)floorf(fx)), x1 = min(src_w - 1, x0 + 1);
+    int y0 = max(0, (int)floorf(fy)), y1 = min(src_h - 1, y0 + 1);
+    float wx = fx - x0, wy = fy - y0;
+
+    float p00 = (float)src[y0 * src_pitch + x0];
+    float p01 = (float)src[y0 * src_pitch + x1];
+    float p10 = (float)src[y1 * src_pitch + x0];
+    float p11 = (float)src[y1 * src_pitch + x1];
+    float v = (1 - wx) * (1 - wy) * p00 + wx * (1 - wy) * p01
+            + (1 - wx) * wy * p10 + wx * wy * p11;
+
+    dst[(dst_y + y) * dst_pitch + (dst_x + x)] = (unsigned char)v;
+}
+
+__global__ static void cfc_kern_resize_uv(
+    const unsigned char *src, int src_w, int src_h, int src_pitch,
+    unsigned char *dst, int dst_x, int dst_y, int dst_w, int dst_h, int dst_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dst_w || y >= dst_h) return;
+
+    float fx = ((float)x + 0.5f) * src_w / dst_w - 0.5f;
+    float fy = ((float)y + 0.5f) * src_h / dst_h - 0.5f;
+    int x0 = max(0, (int)floorf(fx)), x1 = min(src_w - 1, x0 + 1);
+    int y0 = max(0, (int)floorf(fy)), y1 = min(src_h - 1, y0 + 1);
+    float wx = fx - x0, wy = fy - y0;
+
+    float u00 = (float)src[y0 * src_pitch + x0 * 2 + 0];
+    float v00 = (float)src[y0 * src_pitch + x0 * 2 + 1];
+    float u01 = (float)src[y0 * src_pitch + x1 * 2 + 0];
+    float v01 = (float)src[y0 * src_pitch + x1 * 2 + 1];
+    float u10 = (float)src[y1 * src_pitch + x0 * 2 + 0];
+    float v10 = (float)src[y1 * src_pitch + x0 * 2 + 1];
+    float u11 = (float)src[y1 * src_pitch + x1 * 2 + 0];
+    float v11 = (float)src[y1 * src_pitch + x1 * 2 + 1];
+
+    float u = (1 - wx) * (1 - wy) * u00 + wx * (1 - wy) * u01
+            + (1 - wx) * wy * u10 + wx * wy * u11;
+    float v = (1 - wx) * (1 - wy) * v00 + wx * (1 - wy) * v01
+            + (1 - wx) * wy * v10 + wx * wy * v11;
+
+    int idx = (dst_y + y) * dst_pitch + (dst_x + x) * 2;
+    dst[idx + 0] = (unsigned char)u;
+    dst[idx + 1] = (unsigned char)v;
+}
+
+/* ── C launcher API ───────────────────────────────────────────────────── */
+
+extern "C" {
+
+int cfc_cugrid_init(void)
+{
+    return 0;
+}
+
+int cfc_cugrid_fill_nv12(
+    CUstream stream,
+    CUdeviceptr dst_y, int pitch_y,
+    CUdeviceptr dst_uv, int pitch_uv,
+    int x, int y, int w, int h,
+    int color_y, int color_u, int color_v, int alpha)
+{
+    if (w <= 0 || h <= 0) return 0;
+    if (alpha <= 0) return 0;
+    if (alpha > 255) alpha = 255;
+
+    dim3 block_y_(CFC_BLOCK_X, CFC_BLOCK_Y);
+    dim3 grid_y_((w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
+                 (h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
+    cfc_kern_fill_y<<<grid_y_, block_y_, 0, (cudaStream_t)stream>>>(
+        (unsigned char *)dst_y, pitch_y,
+        x, y, w, h, color_y, alpha);
+    CHECK_CUDA(cudaGetLastError());
+
+    int uv_x = x / 2, uv_y = y / 2;
+    int uv_w = w / 2, uv_h = h / 2;
+    dim3 block_uv_(CFC_BLOCK_X, CFC_BLOCK_Y);
+    dim3 grid_uv_((uv_w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
+                  (uv_h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
+    cfc_kern_fill_uv<<<grid_uv_, block_uv_, 0, (cudaStream_t)stream>>>(
+        (unsigned char *)dst_uv, pitch_uv,
+        uv_x, uv_y, uv_w, uv_h, color_u, color_v, alpha);
+    CHECK_CUDA(cudaGetLastError());
+
+    return 0;
+}
+
+int cfc_cugrid_resize_nv12(
+    CUstream stream,
+    CUdeviceptr src_y, int src_w, int src_h, int src_pitch_y,
+    CUdeviceptr src_uv,                       int src_pitch_uv,
+    CUdeviceptr dst_y, int dst_pitch_y,
+    CUdeviceptr dst_uv, int dst_pitch_uv,
+    int dst_x, int dst_y_off, int dst_w, int dst_h)
+{
+    if (dst_w <= 0 || dst_h <= 0) return 0;
+    if (src_w <= 0 || src_h <= 0) return -1;
+
+    dim3 block_y_(CFC_BLOCK_X, CFC_BLOCK_Y);
+    dim3 grid_y_((dst_w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
+                 (dst_h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
+    cfc_kern_resize_y<<<grid_y_, block_y_, 0, (cudaStream_t)stream>>>(
+        (const unsigned char *)src_y, src_w, src_h, src_pitch_y,
+        (unsigned char *)dst_y, dst_x, dst_y_off, dst_w, dst_h, dst_pitch_y);
+    CHECK_CUDA(cudaGetLastError());
+
+    int uv_src_w = src_w / 2, uv_src_h = src_h / 2;
+    int uv_dst_w = dst_w / 2, uv_dst_h = dst_h / 2;
+    int uv_dst_x = dst_x / 2, uv_dst_y = dst_y_off / 2;
+    dim3 block_uv_(CFC_BLOCK_X, CFC_BLOCK_Y);
+    dim3 grid_uv_((uv_dst_w + CFC_BLOCK_X - 1) / CFC_BLOCK_X,
+                  (uv_dst_h + CFC_BLOCK_Y - 1) / CFC_BLOCK_Y);
+    cfc_kern_resize_uv<<<grid_uv_, block_uv_, 0, (cudaStream_t)stream>>>(
+        (const unsigned char *)src_uv, uv_src_w, uv_src_h, src_pitch_uv,
+        (unsigned char *)dst_uv, uv_dst_x, uv_dst_y, uv_dst_w, uv_dst_h, dst_pitch_uv);
+    CHECK_CUDA(cudaGetLastError());
+
+    return 0;
+}
+
+}  /* extern "C" */