vf_cuda_grid: Phase 2a — layout templates + dynamic nb_inputs

Layout templates (9):
- single, dual_horizontal, dual_vertical
- quad (default), main_plus_preview (1 big + 3 small)
- six_grid (3x2), nine_grid (3x3), sixteen_grid (4x4)
- panoramic

Cells определены в normalized координатах (0.0-1.0), переводятся в pixels
в config_output (× out_w/out_h). Alignment до chroma boundary (NV12 ÷ 2).

Filter options:
- layout=<name> (default quad)
- out_w=<int> (default 1920)
- out_h=<int> (default 1080)

Dynamic inputs:
- nb_inputs derived из layout (single=1, quad=4, nine_grid=9, sixteen_grid=16)
- ff_append_inpad_free_name в init() для каждой cell
- AVFILTER_FLAG_DYNAMIC_INPUTS на filter

Phase 2a limitation:
- Каждый input должен быть точно cell_px size (no scaling).
- Phase 2b добавит NPP resize для mixed-size inputs.
This commit is contained in:
gx
2026-05-19 20:57:08 +01:00
parent 4313c3f30d
commit 6ee2f474c7
+176 -78
View File
@@ -4,12 +4,13 @@
* Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
* в layout. End-to-end CUDA (без CPU round-trip).
*
* Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера,
* output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per
* Y/UV plane на каждый input → в соответствующий quadrant output.
* Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
* six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size
* через option (default 1920×1080). Cell rects = normalized × output size.
* **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP).
*
* Future phases (см. gx/vf-cuda-grid#1):
* - Phase 2: dynamic layouts + per-cell scaling
* - Phase 2b: per-cell scaling через libnpp (mixed-size inputs)
* - Phase 3: runtime layout switching через process_command (ZMQ)
* - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
*
@@ -34,28 +35,115 @@
#include "video.h"
#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
#define MAX_CELLS 16
#define CUDA_GRID_INPUTS 4 /* Phase 1: fixed quad */
/* ─── Layout templates (normalized coordinates, 0.0–1.0) ───────────────── */

/* One cell of a layout, expressed as fractions of the output frame size. */
typedef struct LayoutCell {
    float x;  /* horizontal offset, fraction of output width */
    float y;  /* vertical offset, fraction of output height */
    float w;  /* cell width, fraction of output width */
    float h;  /* cell height, fraction of output height */
} LayoutCell;
/* A named, fixed arrangement of cells in normalized coordinates. */
typedef struct LayoutTemplate {
    const char *name;              /* identifier matched by find_layout() */
    int         nb_cells;          /* number of leading cells[] entries in use */
    LayoutCell  cells[MAX_CELLS];  /* one normalized rectangle per cell */
} LayoutTemplate;
/* Layouts copy-paste'нуты по структуре cctv-processor config/grids.json. */
static const LayoutTemplate layouts[] = {
{
"single", 1,
{ {0.0f, 0.0f, 1.0f, 1.0f} }
},
{
"dual_horizontal", 2,
{ {0.0f, 0.0f, 0.5f, 1.0f}, {0.5f, 0.0f, 0.5f, 1.0f} }
},
{
"dual_vertical", 2,
{ {0.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 0.5f, 1.0f, 0.5f} }
},
{
"quad", 4,
{
{0.0f, 0.0f, 0.5f, 0.5f}, {0.5f, 0.0f, 0.5f, 0.5f},
{0.0f, 0.5f, 0.5f, 0.5f}, {0.5f, 0.5f, 0.5f, 0.5f},
}
},
{
/* Main camera 2/3 width, 3 small cameras stacked справа сверху вниз */
"main_plus_preview", 4,
{
{0.0f, 0.0f, 2.0f/3, 1.0f},
{2.0f/3, 0.0f, 1.0f/3, 1.0f/3},
{2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
{2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
}
},
{
"six_grid", 6,
{
{0.0f, 0.0f, 1.0f/3, 0.5f}, {1.0f/3, 0.0f, 1.0f/3, 0.5f}, {2.0f/3, 0.0f, 1.0f/3, 0.5f},
{0.0f, 0.5f, 1.0f/3, 0.5f}, {1.0f/3, 0.5f, 1.0f/3, 0.5f}, {2.0f/3, 0.5f, 1.0f/3, 0.5f},
}
},
{
"nine_grid", 9,
{
{0.0f, 0.0f, 1.0f/3, 1.0f/3}, {1.0f/3, 0.0f, 1.0f/3, 1.0f/3}, {2.0f/3, 0.0f, 1.0f/3, 1.0f/3},
{0.0f, 1.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
{0.0f, 2.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
}
},
{
"sixteen_grid", 16,
{
{0.00f, 0.00f, 0.25f, 0.25f}, {0.25f, 0.00f, 0.25f, 0.25f}, {0.50f, 0.00f, 0.25f, 0.25f}, {0.75f, 0.00f, 0.25f, 0.25f},
{0.00f, 0.25f, 0.25f, 0.25f}, {0.25f, 0.25f, 0.25f, 0.25f}, {0.50f, 0.25f, 0.25f, 0.25f}, {0.75f, 0.25f, 0.25f, 0.25f},
{0.00f, 0.50f, 0.25f, 0.25f}, {0.25f, 0.50f, 0.25f, 0.25f}, {0.50f, 0.50f, 0.25f, 0.25f}, {0.75f, 0.50f, 0.25f, 0.25f},
{0.00f, 0.75f, 0.25f, 0.25f}, {0.25f, 0.75f, 0.25f, 0.25f}, {0.50f, 0.75f, 0.25f, 0.25f}, {0.75f, 0.75f, 0.25f, 0.25f},
}
},
{
/* Один widescreen panoramic — 1 cell full width */
"panoramic", 1,
{ {0.0f, 0.0f, 1.0f, 1.0f} }
},
};
/**
 * Look up a built-in layout template by name.
 *
 * @param name  layout name to search for; may be NULL
 * @return pointer to the matching entry of the static layouts[] table, or
 *         NULL when name is NULL or no entry has exactly that name
 */
static const LayoutTemplate *find_layout(const char *name)
{
    /* strcmp() on a NULL pointer is undefined behaviour; report "not found". */
    if (!name)
        return NULL;

    for (size_t i = 0; i < FF_ARRAY_ELEMS(layouts); i++) {
        if (!strcmp(layouts[i].name, name))
            return &layouts[i];
    }
    return NULL;
}
/* ─── Filter state ─────────────────────────────────────────────────────── */
typedef struct CudaGridContext {
const AVClass *class;
AVBufferRef *hw_device_ctx;
/* Options */
char *layout_name;
int out_width;
int out_height;
/* Resolved layout (после init) */
const LayoutTemplate *layout;
/* CUDA */
AVCUDADeviceContext *hwctx;
CUcontext cu_ctx;
CUstream cu_stream;
FFFrameSync fs;
/* Output dimensions (computed in config_output) */
int out_width;
int out_height;
/* Per-cell target rectangles в output frame.
* Phase 1 hardcode: 4 ячейки 2×2 (top-left, top-right, bottom-left, bottom-right). */
/* Per-cell pixel rects (computed в config_output из normalized × out size) */
struct {
int x, y, w, h;
} cells[CUDA_GRID_INPUTS];
} cell_px[MAX_CELLS];
} CudaGridContext;
/* ─── Composition: copy одного input plane в target region output ──────── */
@@ -87,7 +175,7 @@ static int copy_input_plane(AVFilterContext *ctx,
return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
}
/* ─── Framesync callback — N frames are ready, compose ────────────────── */
/* ─── Framesync callback ──────────────────────────────────────────────── */
static int cuda_grid_compose(FFFrameSync *fs)
{
@@ -95,12 +183,12 @@ static int cuda_grid_compose(FFFrameSync *fs)
AVFilterLink *outlink = ctx->outputs[0];
CudaGridContext *s = ctx->priv;
AVFrame *out = NULL;
AVFrame *in[CUDA_GRID_INPUTS] = {0};
AVFrame *in[MAX_CELLS] = {0};
CUcontext dummy;
int ret;
int i, ret;
int nb = s->layout->nb_cells;
/* Сбор всех N input frames из framesync */
for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
for (i = 0; i < nb; i++) {
ret = ff_framesync_get_frame(fs, i, &in[i], 0);
if (ret < 0)
return ret;
@@ -110,43 +198,38 @@ static int cuda_grid_compose(FFFrameSync *fs)
}
}
/* Output frame из output's hw_frames_pool */
out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
if (!out)
return AVERROR(ENOMEM);
/* Copy props (timestamps, color metadata) от первого input */
ret = av_frame_copy_props(out, in[0]);
if (ret < 0)
goto fail;
out->width = s->out_width;
out->height = s->out_height;
/* CUDA context push для всех cuMemcpy в этом filter call */
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
if (ret < 0)
goto fail;
/* Для каждого input — copy Y plane + UV plane в свою quadrant.
* NV12 layout: data[0] = Y, data[1] = UV interleaved. linesize[0/1] = pitch. */
for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
for (i = 0; i < nb; i++) {
AVFrame *src = in[i];
int cx = s->cells[i].x;
int cy = s->cells[i].y;
int cw = s->cells[i].w;
int ch = s->cells[i].h;
int cx = s->cell_px[i].x;
int cy = s->cell_px[i].y;
int cw = s->cell_px[i].w;
int ch = s->cell_px[i].h;
if (src->width != cw || src->height != ch) {
av_log(ctx, AV_LOG_ERROR,
"input %d size %dx%d != expected cell size %dx%d "
"(Phase 1: no scaling, all inputs must match cell size)\n",
"input %d size %dx%d != cell size %dx%d "
"(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n",
i, src->width, src->height, cw, ch);
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
ret = AVERROR(EINVAL);
goto fail;
}
/* Y plane (full resolution, 1 byte per pixel) */
/* Y plane */
ret = copy_input_plane(ctx,
(CUdeviceptr)src->data[0], src->linesize[0],
src->width, src->height,
@@ -157,7 +240,7 @@ static int cuda_grid_compose(FFFrameSync *fs)
goto fail;
}
/* UV plane (half resolution для NV12, но 2 bytes per "pixel" — interleaved UV) */
/* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */
ret = copy_input_plane(ctx,
(CUdeviceptr)src->data[1], src->linesize[1],
src->width / 2, src->height / 2,
@@ -170,7 +253,6 @@ static int cuda_grid_compose(FFFrameSync *fs)
}
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
return ff_filter_frame(outlink, out);
fail:
@@ -178,14 +260,38 @@ fail:
return ret;
}
/* ─── Lifecycle: init / uninit / query_formats / config_input / config_output ─ */
/* ─── Lifecycle ────────────────────────────────────────────────────────── */
static av_cold int cuda_grid_init(AVFilterContext *ctx)
{
CudaGridContext *s = ctx->priv;
/* Сами inputs регистрируем в filter struct (см. внизу — нельзя AVFILTER_INPUT_COUNT_MAX
* без явных AVFilterPad'ов). Phase 1 fix=4. */
(void)s;
const LayoutTemplate *lt;
int i, ret;
lt = find_layout(s->layout_name);
if (!lt) {
av_log(ctx, AV_LOG_ERROR, "unknown layout '%s'. Доступны: ", s->layout_name);
for (i = 0; i < (int)FF_ARRAY_ELEMS(layouts); i++)
av_log(ctx, AV_LOG_ERROR, "%s ", layouts[i].name);
av_log(ctx, AV_LOG_ERROR, "\n");
return AVERROR(EINVAL);
}
s->layout = lt;
/* Dynamic inputs — append pad per cell */
for (i = 0; i < lt->nb_cells; i++) {
AVFilterPad pad = { 0 };
pad.type = AVMEDIA_TYPE_VIDEO;
pad.name = av_asprintf("input%d", i);
if (!pad.name)
return AVERROR(ENOMEM);
ret = ff_append_inpad_free_name(ctx, &pad);
if (ret < 0)
return ret;
}
av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n",
lt->name, lt->nb_cells, s->out_width, s->out_height);
return 0;
}
@@ -193,7 +299,6 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
{
CudaGridContext *s = ctx->priv;
ff_framesync_uninit(&s->fs);
av_buffer_unref(&s->hw_device_ctx);
}
static int cuda_grid_config_input(AVFilterLink *inlink)
@@ -220,7 +325,7 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
AVHWDeviceContext *hwdev;
AVBufferRef *out_ref;
AVHWFramesContext *out_hwfc;
int W, H, ret;
int i, ret;
if (!inl0->hw_frames_ctx)
return AVERROR(EINVAL);
@@ -228,15 +333,28 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
av_log(ctx, AV_LOG_ERROR,
"Phase 1 supports only NV12, got %s\n",
"Phase 1-2a поддерживают только NV12, got %s\n",
av_get_pix_fmt_name(hwfc0->sw_format));
return AVERROR(EINVAL);
}
/* Все inputs должны иметь одинаковый device и sw_format (Phase 1 also same size) */
W = in0->w;
H = in0->h;
for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
/* Compute pixel rects из normalized layout × output size */
for (i = 0; i < s->layout->nb_cells; i++) {
s->cell_px[i].x = (int)(s->layout->cells[i].x * s->out_width);
s->cell_px[i].y = (int)(s->layout->cells[i].y * s->out_height);
s->cell_px[i].w = (int)(s->layout->cells[i].w * s->out_width);
s->cell_px[i].h = (int)(s->layout->cells[i].h * s->out_height);
/* Align до chroma boundary (NV12 → 2x2) */
s->cell_px[i].x &= ~1;
s->cell_px[i].y &= ~1;
s->cell_px[i].w &= ~1;
s->cell_px[i].h &= ~1;
av_log(ctx, AV_LOG_VERBOSE, " cell[%d] = %dx%d @ (%d,%d)\n",
i, s->cell_px[i].w, s->cell_px[i].h, s->cell_px[i].x, s->cell_px[i].y);
}
/* Validate все inputs: device match + sw_format match */
for (i = 1; i < s->layout->nb_cells; i++) {
AVFilterLink *inN = ctx->inputs[i];
FilterLink *ilN = ff_filter_link(inN);
AVHWFramesContext *hN;
@@ -251,34 +369,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
return AVERROR(EINVAL);
}
if (inN->w != W || inN->h != H) {
av_log(ctx, AV_LOG_ERROR,
"Phase 1: input %d size %dx%d != input 0 size %dx%d. "
"В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n",
i, inN->w, inN->h, W, H);
return AVERROR(EINVAL);
}
}
/* Output = 2W × 2H для quad layout */
s->out_width = 2 * W;
s->out_height = 2 * H;
outlink->w = s->out_width;
outlink->h = s->out_height;
/* Hardcoded quad cell positions */
s->cells[0].x = 0; s->cells[0].y = 0; s->cells[0].w = W; s->cells[0].h = H;
s->cells[1].x = W; s->cells[1].y = 0; s->cells[1].w = W; s->cells[1].h = H;
s->cells[2].x = 0; s->cells[2].y = H; s->cells[2].w = W; s->cells[2].h = H;
s->cells[3].x = W; s->cells[3].y = H; s->cells[3].w = W; s->cells[3].h = H;
/* Setup CUDA device + stream context из input 0 */
hwdev = hwfc0->device_ctx;
s->hwctx = (AVCUDADeviceContext *)hwdev->hwctx;
s->cu_ctx = s->hwctx->cuda_ctx;
s->cu_stream = s->hwctx->stream;
/* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */
out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
if (!out_ref)
return AVERROR(ENOMEM);
@@ -296,14 +396,12 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
}
outl->hw_frames_ctx = out_ref;
/* Setup framesync для lock-step pull от N inputs */
ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
ret = ff_framesync_init(&s->fs, ctx, s->layout->nb_cells);
if (ret < 0)
return ret;
{
int i;
FFFrameSyncIn *fs_in;
for (i = 0; i < CUDA_GRID_INPUTS; i++) {
for (i = 0; i < s->layout->nb_cells; i++) {
fs_in = &s->fs.in[i];
fs_in->time_base = ctx->inputs[i]->time_base;
fs_in->sync = 1;
@@ -315,7 +413,6 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
s->fs.on_event = cuda_grid_compose;
outlink->time_base = ctx->inputs[0]->time_base;
return ff_framesync_configure(&s->fs);
}
@@ -325,10 +422,18 @@ static int cuda_grid_activate(AVFilterContext *ctx)
return ff_framesync_activate(&s->fs);
}
/* ─── Filter registration ──────────────────────────────────────────────── */
/* ─── Options + registration ───────────────────────────────────────────── */
#define OFFSET(x) offsetof(CudaGridContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
/* User-settable options: layout template name and output frame size. */
static const AVOption cuda_grid_options[] = {
    {
        .name        = "layout",
        .help        = "имя layout template",
        .offset      = OFFSET(layout_name),
        .type        = AV_OPT_TYPE_STRING,
        .default_val = { .str = "quad" },
        .min         = 0,
        .max         = 0,
        .flags       = FLAGS,
    },
    {
        .name        = "out_w",
        .help        = "ширина output frame в пикселях",
        .offset      = OFFSET(out_width),
        .type        = AV_OPT_TYPE_INT,
        .default_val = { .i64 = 1920 },
        .min         = 16,
        .max         = 16384,
        .flags       = FLAGS,
    },
    {
        .name        = "out_h",
        .help        = "высота output frame в пикселях",
        .offset      = OFFSET(out_height),
        .type        = AV_OPT_TYPE_INT,
        .default_val = { .i64 = 1080 },
        .min         = 16,
        .max         = 16384,
        .flags       = FLAGS,
    },
    { .name = NULL }  /* table terminator */
};
@@ -340,13 +445,6 @@ static const AVClass cuda_grid_class = {
.category = AV_CLASS_CATEGORY_FILTER,
};
static const AVFilterPad cuda_grid_inputs[] = {
{ .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
{ .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
{ .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
{ .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
};
/* The filter's single video output pad; cuda_grid_config_output() is its
 * config_props callback. */
static const AVFilterPad cuda_grid_outputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = cuda_grid_config_output,
    },
};
@@ -359,9 +457,9 @@ const AVFilter ff_vf_cuda_grid = {
.init = cuda_grid_init,
.uninit = cuda_grid_uninit,
.activate = cuda_grid_activate,
FILTER_INPUTS(cuda_grid_inputs),
/* No FILTER_INPUTS — pads added dynamically в init() per layout. */
FILTER_OUTPUTS(cuda_grid_outputs),
FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
.flags = AVFILTER_FLAG_HWDEVICE,
.flags = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
.flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};