vf_cuda_grid: Phase 2a — layout templates + dynamic nb_inputs

Layout templates (9):
- single, dual_horizontal, dual_vertical
- quad (default), main_plus_preview (1 big + 3 small)
- six_grid (3x2), nine_grid (3x3), sixteen_grid (4x4)
- panoramic

Cells определены в normalized координатах (0.0-1.0), переводятся в pixels
в config_output (× out_w/out_h). Alignment до chroma boundary (NV12 ÷ 2).

Filter options:
- layout=<name> (default quad)
- out_w=<int> (default 1920)
- out_h=<int> (default 1080)

Dynamic inputs:
- nb_inputs derived из layout (single=1, quad=4, nine_grid=9, sixteen_grid=16)
- ff_append_inpad_free_name в init() для каждой cell
- AVFILTER_FLAG_DYNAMIC_INPUTS на filter

Phase 2a limitation:
- Каждый input должен быть точно cell_px size (no scaling).
- Phase 2b добавит NPP resize для mixed-size inputs.
2026-05-19 20:57:08 +01:00
parent 4313c3f30d
commit 6ee2f474c7
1 changed files with 176 additions and 78 deletions
@@ -4,12 +4,13 @@
 * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
 * в layout. End-to-end CUDA (без CPU round-trip).
 *
- * Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера,
- * output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per
- * Y/UV plane на каждый input → soответствующую quadrant'у output.
+ * Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
+ * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size
+ * через option (default 1920×1080). Cell rects = normalized × output size.
+ * **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP).
 *
 * Future phases (см. gx/vf-cuda-grid#1):
- *  - Phase 2: dynamic layouts + per-cell scaling
+ *  - Phase 2b: per-cell scaling через libnpp (mixed-size inputs)
 *  - Phase 3: runtime layout switching через process_command (ZMQ)
 *  - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
 *
@@ -34,28 +35,115 @@
 #include "video.h"

 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+#define MAX_CELLS 16

-#define CUDA_GRID_INPUTS 4  /* Phase 1: fixed quad */
+/* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */
+
+typedef struct LayoutCell {
+    float x, y, w, h;   /* normalized fraction of output size */
+} LayoutCell;
+
+typedef struct LayoutTemplate {
+    const char *name;
+    int nb_cells;
+    LayoutCell cells[MAX_CELLS];
+} LayoutTemplate;
+
+/* Layouts copy-paste'нуты по структуре cctv-processor config/grids.json. */
+static const LayoutTemplate layouts[] = {
+    {
+        "single", 1,
+        { {0.0f, 0.0f, 1.0f, 1.0f} }
+    },
+    {
+        "dual_horizontal", 2,
+        { {0.0f, 0.0f, 0.5f, 1.0f}, {0.5f, 0.0f, 0.5f, 1.0f} }
+    },
+    {
+        "dual_vertical", 2,
+        { {0.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 0.5f, 1.0f, 0.5f} }
+    },
+    {
+        "quad", 4,
+        {
+            {0.0f, 0.0f, 0.5f, 0.5f}, {0.5f, 0.0f, 0.5f, 0.5f},
+            {0.0f, 0.5f, 0.5f, 0.5f}, {0.5f, 0.5f, 0.5f, 0.5f},
+        }
+    },
+    {
+        /* Main camera 2/3 width, 3 small cameras stacked справа сверху вниз */
+        "main_plus_preview", 4,
+        {
+            {0.0f,      0.0f, 2.0f/3, 1.0f},
+            {2.0f/3,    0.0f, 1.0f/3, 1.0f/3},
+            {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
+            {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
+        }
+    },
+    {
+        "six_grid", 6,
+        {
+            {0.0f, 0.0f, 1.0f/3, 0.5f}, {1.0f/3, 0.0f, 1.0f/3, 0.5f}, {2.0f/3, 0.0f, 1.0f/3, 0.5f},
+            {0.0f, 0.5f, 1.0f/3, 0.5f}, {1.0f/3, 0.5f, 1.0f/3, 0.5f}, {2.0f/3, 0.5f, 1.0f/3, 0.5f},
+        }
+    },
+    {
+        "nine_grid", 9,
+        {
+            {0.0f, 0.0f, 1.0f/3, 1.0f/3}, {1.0f/3, 0.0f, 1.0f/3, 1.0f/3}, {2.0f/3, 0.0f, 1.0f/3, 1.0f/3},
+            {0.0f, 1.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
+            {0.0f, 2.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
+        }
+    },
+    {
+        "sixteen_grid", 16,
+        {
+            {0.00f, 0.00f, 0.25f, 0.25f}, {0.25f, 0.00f, 0.25f, 0.25f}, {0.50f, 0.00f, 0.25f, 0.25f}, {0.75f, 0.00f, 0.25f, 0.25f},
+            {0.00f, 0.25f, 0.25f, 0.25f}, {0.25f, 0.25f, 0.25f, 0.25f}, {0.50f, 0.25f, 0.25f, 0.25f}, {0.75f, 0.25f, 0.25f, 0.25f},
+            {0.00f, 0.50f, 0.25f, 0.25f}, {0.25f, 0.50f, 0.25f, 0.25f}, {0.50f, 0.50f, 0.25f, 0.25f}, {0.75f, 0.50f, 0.25f, 0.25f},
+            {0.00f, 0.75f, 0.25f, 0.25f}, {0.25f, 0.75f, 0.25f, 0.25f}, {0.50f, 0.75f, 0.25f, 0.25f}, {0.75f, 0.75f, 0.25f, 0.25f},
+        }
+    },
+    {
+        /* Один widescreen panoramic — 1 cell full width */
+        "panoramic", 1,
+        { {0.0f, 0.0f, 1.0f, 1.0f} }
+    },
+};
+
+static const LayoutTemplate *find_layout(const char *name)
+{
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(layouts); i++) {
+        if (!strcmp(layouts[i].name, name))
+            return &layouts[i];
+    }
+    return NULL;
+}
+
+/* ─── Filter state ─────────────────────────────────────────────────────── */

 typedef struct CudaGridContext {
    const AVClass *class;

-    AVBufferRef *hw_device_ctx;
+    /* Options */
+    char *layout_name;
+    int   out_width;
+    int   out_height;
+
+    /* Resolved layout (после init) */
+    const LayoutTemplate *layout;
+
+    /* CUDA */
    AVCUDADeviceContext *hwctx;
    CUcontext cu_ctx;
    CUstream  cu_stream;

    FFFrameSync fs;

-    /* Output dimensions (computed in config_output) */
-    int out_width;
-    int out_height;
-
-    /* Per-cell target rectangles в output frame.
-     * Phase 1 hardcode: 4 ячейки 2×2 (top-left, top-right, bottom-left, bottom-right). */
+    /* Per-cell pixel rects (computed в config_output из normalized × out size) */
    struct {
        int x, y, w, h;
-    } cells[CUDA_GRID_INPUTS];
+    } cell_px[MAX_CELLS];
 } CudaGridContext;

 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -87,7 +175,7 @@ static int copy_input_plane(AVFilterContext *ctx,
    return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
 }

-/* ─── Framesync callback — N frames аre ready, compose ────────────────── */
+/* ─── Framesync callback ──────────────────────────────────────────────── */

 static int cuda_grid_compose(FFFrameSync *fs)
 {
@@ -95,12 +183,12 @@ static int cuda_grid_compose(FFFrameSync *fs)
    AVFilterLink    *outlink = ctx->outputs[0];
    CudaGridContext *s       = ctx->priv;
    AVFrame *out = NULL;
-    AVFrame *in[CUDA_GRID_INPUTS] = {0};
+    AVFrame *in[MAX_CELLS] = {0};
    CUcontext dummy;
-    int ret;
+    int i, ret;
+    int nb = s->layout->nb_cells;

-    /* Сбор всех N input frames из framesync */
-    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+    for (i = 0; i < nb; i++) {
        ret = ff_framesync_get_frame(fs, i, &in[i], 0);
        if (ret < 0)
            return ret;
@@ -110,43 +198,38 @@ static int cuda_grid_compose(FFFrameSync *fs)
        }
    }

-    /* Output frame из output's hw_frames_pool */
    out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
    if (!out)
        return AVERROR(ENOMEM);

-    /* Copy props (timestamps, color metadata) от первого input */
    ret = av_frame_copy_props(out, in[0]);
    if (ret < 0)
        goto fail;
    out->width  = s->out_width;
    out->height = s->out_height;

-    /* CUDA context push для всех cuMemcpy в этом filter call */
    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
    if (ret < 0)
        goto fail;

-    /* Для каждого input — copy Y plane + UV plane в свою quadrant.
-     * NV12 layout: data[0] = Y, data[1] = UV interleaved. linesize[0/1] = pitch. */
-    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+    for (i = 0; i < nb; i++) {
        AVFrame *src = in[i];
-        int cx = s->cells[i].x;
-        int cy = s->cells[i].y;
-        int cw = s->cells[i].w;
-        int ch = s->cells[i].h;
+        int cx = s->cell_px[i].x;
+        int cy = s->cell_px[i].y;
+        int cw = s->cell_px[i].w;
+        int ch = s->cell_px[i].h;

        if (src->width != cw || src->height != ch) {
            av_log(ctx, AV_LOG_ERROR,
-                   "input %d size %dx%d != expected cell size %dx%d "
-                   "(Phase 1: no scaling, all inputs must match cell size)\n",
+                   "input %d size %dx%d != cell size %dx%d "
+                   "(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n",
                   i, src->width, src->height, cw, ch);
            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
            ret = AVERROR(EINVAL);
            goto fail;
        }

-        /* Y plane (full resolution, 1 byte per pixel) */
+        /* Y plane */
        ret = copy_input_plane(ctx,
                               (CUdeviceptr)src->data[0], src->linesize[0],
                               src->width, src->height,
@@ -157,7 +240,7 @@ static int cuda_grid_compose(FFFrameSync *fs)
            goto fail;
        }

-        /* UV plane (half resolution для NV12, но 2 bytes per "pixel" — interleaved UV) */
+        /* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */
        ret = copy_input_plane(ctx,
                               (CUdeviceptr)src->data[1], src->linesize[1],
                               src->width / 2, src->height / 2,
@@ -170,7 +253,6 @@ static int cuda_grid_compose(FFFrameSync *fs)
    }

    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-
    return ff_filter_frame(outlink, out);

 fail:
@@ -178,14 +260,38 @@ fail:
    return ret;
 }

-/* ─── Lifecycle: init / uninit / query_formats / config_input / config_output ─ */
+/* ─── Lifecycle ────────────────────────────────────────────────────────── */

 static av_cold int cuda_grid_init(AVFilterContext *ctx)
 {
    CudaGridContext *s = ctx->priv;
-    /* Сами inputs регистрируем в filter struct (см. внизу — нельзя AVFILTER_INPUT_COUNT_MAX
-     * без явных AVFilterPad'ов). Phase 1 fix=4. */
-    (void)s;
+    const LayoutTemplate *lt;
+    int i, ret;
+
+    lt = find_layout(s->layout_name);
+    if (!lt) {
+        av_log(ctx, AV_LOG_ERROR, "unknown layout '%s'. Доступны: ", s->layout_name);
+        for (i = 0; i < (int)FF_ARRAY_ELEMS(layouts); i++)
+            av_log(ctx, AV_LOG_ERROR, "%s ", layouts[i].name);
+        av_log(ctx, AV_LOG_ERROR, "\n");
+        return AVERROR(EINVAL);
+    }
+    s->layout = lt;
+
+    /* Dynamic inputs — append pad per cell */
+    for (i = 0; i < lt->nb_cells; i++) {
+        AVFilterPad pad = { 0 };
+        pad.type = AVMEDIA_TYPE_VIDEO;
+        pad.name = av_asprintf("input%d", i);
+        if (!pad.name)
+            return AVERROR(ENOMEM);
+        ret = ff_append_inpad_free_name(ctx, &pad);
+        if (ret < 0)
+            return ret;
+    }
+
+    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n",
+           lt->name, lt->nb_cells, s->out_width, s->out_height);
    return 0;
 }

@@ -193,7 +299,6 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
 {
    CudaGridContext *s = ctx->priv;
    ff_framesync_uninit(&s->fs);
-    av_buffer_unref(&s->hw_device_ctx);
 }

 static int cuda_grid_config_input(AVFilterLink *inlink)
@@ -220,7 +325,7 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
    AVHWDeviceContext   *hwdev;
    AVBufferRef         *out_ref;
    AVHWFramesContext   *out_hwfc;
-    int W, H, ret;
+    int i, ret;

    if (!inl0->hw_frames_ctx)
        return AVERROR(EINVAL);
@@ -228,15 +333,28 @@ static int cuda_grid_config_output(AVFilterLink *outlink)

    if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
        av_log(ctx, AV_LOG_ERROR,
-               "Phase 1 supports only NV12, got %s\n",
+               "Phase 1-2a поддерживают только NV12, got %s\n",
               av_get_pix_fmt_name(hwfc0->sw_format));
        return AVERROR(EINVAL);
    }

-    /* Все inputs должны иметь одинаковый device и sw_format (Phase 1 also same size) */
-    W = in0->w;
-    H = in0->h;
-    for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
+    /* Compute pixel rects из normalized layout × output size */
+    for (i = 0; i < s->layout->nb_cells; i++) {
+        s->cell_px[i].x = (int)(s->layout->cells[i].x * s->out_width);
+        s->cell_px[i].y = (int)(s->layout->cells[i].y * s->out_height);
+        s->cell_px[i].w = (int)(s->layout->cells[i].w * s->out_width);
+        s->cell_px[i].h = (int)(s->layout->cells[i].h * s->out_height);
+        /* Align до chroma boundary (NV12 → 2x2) */
+        s->cell_px[i].x &= ~1;
+        s->cell_px[i].y &= ~1;
+        s->cell_px[i].w &= ~1;
+        s->cell_px[i].h &= ~1;
+        av_log(ctx, AV_LOG_VERBOSE, "  cell[%d] = %dx%d @ (%d,%d)\n",
+               i, s->cell_px[i].w, s->cell_px[i].h, s->cell_px[i].x, s->cell_px[i].y);
+    }
+
+    /* Validate все inputs: device match + sw_format match */
+    for (i = 1; i < s->layout->nb_cells; i++) {
        AVFilterLink *inN = ctx->inputs[i];
        FilterLink   *ilN = ff_filter_link(inN);
        AVHWFramesContext *hN;
@@ -251,34 +369,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
            av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
            return AVERROR(EINVAL);
        }
-        if (inN->w != W || inN->h != H) {
-            av_log(ctx, AV_LOG_ERROR,
-                   "Phase 1: input %d size %dx%d != input 0 size %dx%d. "
-                   "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n",
-                   i, inN->w, inN->h, W, H);
-            return AVERROR(EINVAL);
-        }
    }

-    /* Output = 2W × 2H для quad layout */
-    s->out_width  = 2 * W;
-    s->out_height = 2 * H;
    outlink->w = s->out_width;
    outlink->h = s->out_height;

-    /* Hardcoded quad cell positions */
-    s->cells[0].x = 0;     s->cells[0].y = 0;     s->cells[0].w = W; s->cells[0].h = H;
-    s->cells[1].x = W;     s->cells[1].y = 0;     s->cells[1].w = W; s->cells[1].h = H;
-    s->cells[2].x = 0;     s->cells[2].y = H;     s->cells[2].w = W; s->cells[2].h = H;
-    s->cells[3].x = W;     s->cells[3].y = H;     s->cells[3].w = W; s->cells[3].h = H;
-
-    /* Setup CUDA device + stream context из input 0 */
    hwdev = hwfc0->device_ctx;
    s->hwctx    = (AVCUDADeviceContext *)hwdev->hwctx;
    s->cu_ctx   = s->hwctx->cuda_ctx;
    s->cu_stream = s->hwctx->stream;

-    /* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */
    out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
    if (!out_ref)
        return AVERROR(ENOMEM);
@@ -296,14 +396,12 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
    }
    outl->hw_frames_ctx = out_ref;

-    /* Setup framesync для lock-step pull от N inputs */
-    ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
+    ret = ff_framesync_init(&s->fs, ctx, s->layout->nb_cells);
    if (ret < 0)
        return ret;
    {
-        int i;
        FFFrameSyncIn *fs_in;
-        for (i = 0; i < CUDA_GRID_INPUTS; i++) {
+        for (i = 0; i < s->layout->nb_cells; i++) {
            fs_in = &s->fs.in[i];
            fs_in->time_base = ctx->inputs[i]->time_base;
            fs_in->sync   = 1;
@@ -315,7 +413,6 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
    s->fs.on_event = cuda_grid_compose;

    outlink->time_base = ctx->inputs[0]->time_base;
-
    return ff_framesync_configure(&s->fs);
 }

@@ -325,10 +422,18 @@ static int cuda_grid_activate(AVFilterContext *ctx)
    return ff_framesync_activate(&s->fs);
 }

-/* ─── Filter registration ──────────────────────────────────────────────── */
+/* ─── Options + registration ───────────────────────────────────────────── */
+
+#define OFFSET(x) offsetof(CudaGridContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)

 static const AVOption cuda_grid_options[] = {
-    /* Phase 1: no options. Phase 2 добавит `layout=`. */
+    { "layout", "имя layout template",
+      OFFSET(layout_name), AV_OPT_TYPE_STRING, { .str = "quad" }, 0, 0, FLAGS },
+    { "out_w", "ширина output frame в пикселях",
+      OFFSET(out_width),   AV_OPT_TYPE_INT,    { .i64 = 1920 }, 16, 16384, FLAGS },
+    { "out_h", "высота output frame в пикселях",
+      OFFSET(out_height),  AV_OPT_TYPE_INT,    { .i64 = 1080 }, 16, 16384, FLAGS },
    { NULL }
 };

@@ -340,13 +445,6 @@ static const AVClass cuda_grid_class = {
    .category   = AV_CLASS_CATEGORY_FILTER,
 };

-static const AVFilterPad cuda_grid_inputs[] = {
-    { .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-    { .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-    { .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-    { .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-};
-
 static const AVFilterPad cuda_grid_outputs[] = {
    { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output },
 };
@@ -359,9 +457,9 @@ const AVFilter ff_vf_cuda_grid = {
    .init           = cuda_grid_init,
    .uninit         = cuda_grid_uninit,
    .activate       = cuda_grid_activate,
-    FILTER_INPUTS(cuda_grid_inputs),
+    /* No FILTER_INPUTS — pads added dynamically в init() per layout. */
    FILTER_OUTPUTS(cuda_grid_outputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
-    .flags          = AVFILTER_FLAG_HWDEVICE,
+    .flags          = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
 };