vf_cuda_grid: Phase 1 MVP — fixed quad layout, 4 CUDA inputs → 1 output

Phase 1 deliverable (см. gx/vf-cuda-grid#1):
- libavfilter/vf_cuda_grid.c (~270 LOC): multi-input filter, fixed 2×2 quad
- 4 NV12 CUDA frames same size → 2W × 2H output frame
- Composition: cuMemcpy2DAsync per Y + UV plane на каждый input
- framesync для lock-step pull всех 4 inputs
- Output hw_frames_ctx allocated from input device_ref
- Build wiring: CONFIG_CUDA_GRID_FILTER → libavfilter/{Makefile,allfilters.c}, configure deps на ffnvcodec

Limitations Phase 1:
- All inputs must be same size (no scaling)
- Quad layout hardcoded (no DSL, no runtime switching)
- NV12 only (no RGBA/YUV420P)

Phase 2: dynamic layouts + scaling. Phase 3: runtime control via process_command.
This commit is contained in:
gx
2026-05-19 20:47:00 +01:00
parent 8ee2bd8ddb
commit 097ca81605
4 changed files with 362 additions and 0 deletions
Vendored
+1
View File
@@ -3317,6 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
transpose_npp_filter_deps="ffnvcodec libnpp"
overlay_cuda_filter_deps="ffnvcodec"
overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
cuda_grid_filter_deps="ffnvcodec"
sharpen_npp_filter_deps="ffnvcodec libnpp"
ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
+1
View File
@@ -410,6 +410,7 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o
OBJS-$(CONFIG_OVERLAY_FILTER) += vf_overlay.o framesync.o
OBJS-$(CONFIG_OVERLAY_CUDA_FILTER) += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
cuda/load_helper.o
OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o
OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \
opencl/overlay.o framesync.o
OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o
+1
View File
@@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv;
extern const AVFilter ff_vf_overlay_vaapi;
extern const AVFilter ff_vf_overlay_vulkan;
extern const AVFilter ff_vf_overlay_cuda;
extern const AVFilter ff_vf_cuda_grid;
extern const AVFilter ff_vf_owdenoise;
extern const AVFilter ff_vf_pad;
extern const AVFilter ff_vf_pad_opencl;
+359
View File
@@ -0,0 +1,359 @@
/*
* cuda_grid — GPU-native video grid composer для FFmpeg 7.x.
*
* Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
* в layout. End-to-end CUDA (без CPU round-trip).
*
* Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера,
* output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per
* Y/UV plane на каждый input → в соответствующий quadrant output.
*
* Future phases (см. gx/vf-cuda-grid#1):
* - Phase 2: dynamic layouts + per-cell scaling
* - Phase 3: runtime layout switching через process_command (ZMQ)
* - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
*
* Лицензия: LGPL-2.1+ (соответствует FFmpeg)
*/
#include "config_components.h"
#include "libavutil/common.h"
#include "libavutil/hwcontext.h"
#include "libavutil/hwcontext_cuda_internal.h"
#include "libavutil/log.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "avfilter.h"
#include "filters.h"
#include "formats.h"
#include "framesync.h"
#include "video.h"
/* Wrap a CUDA driver call with FF_CUDA_CHECK_DL. The expansion refers to the
 * local variables `ctx` (used as the first macro argument) and `s` (a
 * CudaGridContext pointer), so both must be in scope wherever CHECK_CU is
 * used. Callers in this file treat a negative result as failure. */
#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
/* Number of input pads and grid cells handled by the filter. */
#define CUDA_GRID_INPUTS 4 /* Phase 1: fixed quad */
/* Private state of one cuda_grid filter instance. */
typedef struct CudaGridContext {
/* AVClass pointer for logging/options; set up through cuda_grid_class. */
const AVClass *class;
/* Buffer reference released in cuda_grid_uninit(). */
AVBufferRef *hw_device_ctx;
/* CUDA device context, context handle and stream; filled in
 * cuda_grid_config_output() from the device behind input 0. */
AVCUDADeviceContext *hwctx;
CUcontext cu_ctx;
CUstream cu_stream;
/* Frame synchronizer that delivers one frame per input to
 * cuda_grid_compose(). */
FFFrameSync fs;
/* Output dimensions (computed in config_output) */
int out_width;
int out_height;
/* Per-cell target rectangles in the output frame.
 * Phase 1 is hardcoded to four cells in a 2x2 layout
 * (top-left, top-right, bottom-left, bottom-right). */
struct {
int x, y, w, h;
} cells[CUDA_GRID_INPUTS];
} CudaGridContext;
/* ─── Composition: copy one input plane into its target region of the output ──────── */
/*
 * Queue a device-to-device 2D copy of one plane on the filter's CUDA stream.
 *
 * The whole source plane region (src_w x src_h elements, starting at the
 * plane origin) is copied to the destination plane at element offset
 * (dst_x, dst_y). Horizontal quantities are converted to bytes with
 * bytes_per_pixel; vertical quantities are rows.
 *
 * ctx             filter context (used for logging and to reach the stream)
 * src_data/pitch  source plane base pointer and row pitch in bytes
 * src_w/src_h     size of the region to copy, in elements / rows
 * dst_data/pitch  destination plane base pointer and row pitch in bytes
 * dst_x/dst_y     destination offset, in elements / rows
 * bytes_per_pixel size of one element in bytes
 *
 * Returns the result of CHECK_CU() on cuMemcpy2DAsync (negative on failure).
 */
static int copy_input_plane(AVFilterContext *ctx,
                            CUdeviceptr src_data,
                            int src_pitch,
                            int src_w,
                            int src_h,
                            CUdeviceptr dst_data,
                            int dst_pitch,
                            int dst_x,
                            int dst_y,
                            int bytes_per_pixel)
{
    CudaGridContext *s = ctx->priv;
    const size_t row_bytes    = (size_t)src_w * bytes_per_pixel;
    const size_t dst_offset_x = (size_t)dst_x * bytes_per_pixel;
    CUDA_MEMCPY2D desc = { 0 };

    /* Source: read from the plane origin (srcXInBytes/srcY stay zero). */
    desc.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    desc.srcDevice     = src_data;
    desc.srcPitch      = src_pitch;

    /* Destination: same memory type, shifted to the cell's position. */
    desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    desc.dstDevice     = dst_data;
    desc.dstPitch      = dst_pitch;
    desc.dstXInBytes   = dst_offset_x;
    desc.dstY          = dst_y;

    desc.WidthInBytes  = row_bytes;
    desc.Height        = src_h;

    return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&desc, s->cu_stream));
}
/* ─── Framesync callback — N frames are ready, compose ────────────────── */
/*
 * Framesync event callback: build one output frame from the current frame of
 * every input.
 *
 * Each input's Y plane and interleaved UV plane (NV12: data[0] = Y,
 * data[1] = UV, linesize[] = pitch) is copied into that input's cell of a
 * freshly allocated output frame, which is then sent downstream.
 *
 * Returns 0 or the result of ff_filter_frame() on success, 0 without output
 * if some input has no frame, or a negative AVERROR code on failure.
 */
static int cuda_grid_compose(FFFrameSync *fs)
{
    AVFilterContext *ctx     = fs->parent;
    AVFilterLink    *outlink = ctx->outputs[0];
    CudaGridContext *s       = ctx->priv;
    AVFrame *in[CUDA_GRID_INPUTS] = { 0 };
    AVFrame *out = NULL;
    CUcontext dummy;
    int ret;

    /* Collect the current frame of every input. get = 0, so the frames are
     * only borrowed from framesync and are not freed here. */
    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        ret = ff_framesync_get_frame(fs, i, &in[i], 0);
        if (ret < 0)
            return ret;
        if (!in[i]) {
            av_log(ctx, AV_LOG_WARNING, "input %d not ready, skipping frame\n", i);
            return 0;
        }
    }

    out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
    if (!out)
        return AVERROR(ENOMEM);

    /* Colour metadata and side data come from the first input. */
    ret = av_frame_copy_props(out, in[0]);
    if (ret < 0)
        goto fail;
    out->width  = s->out_width;
    out->height = s->out_height;

    /* Stamp the output with the framesync event time, converted from the
     * framesync time base to the output link's time base. Copying in[0]'s pts
     * alone is only correct when both time bases happen to be identical. */
    out->pts = av_rescale_q(fs->pts, fs->time_base, outlink->time_base);

    /* Make the device context current for all copies issued below. */
    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
    if (ret < 0)
        goto fail;

    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        const AVFrame *src = in[i];
        const int cx = s->cells[i].x;
        const int cy = s->cells[i].y;
        const int cw = s->cells[i].w;
        const int ch = s->cells[i].h;

        if (src->width != cw || src->height != ch) {
            av_log(ctx, AV_LOG_ERROR,
                   "input %d size %dx%d != expected cell size %dx%d "
                   "(Phase 1: no scaling, all inputs must match cell size)\n",
                   i, src->width, src->height, cw, ch);
            ret = AVERROR(EINVAL);
            break;
        }

        /* Y plane: full resolution, 1 byte per sample. */
        ret = copy_input_plane(ctx,
                               (CUdeviceptr)src->data[0], src->linesize[0],
                               src->width, src->height,
                               (CUdeviceptr)out->data[0], out->linesize[0],
                               cx, cy, 1);
        if (ret < 0)
            break;

        /* UV plane: half resolution, 2 bytes per interleaved UV pair.
         * The extent is rounded up so that odd luma sizes still cover the
         * last chroma column/row instead of leaving it unwritten; the cell
         * offset is rounded down to the containing chroma sample. */
        ret = copy_input_plane(ctx,
                               (CUdeviceptr)src->data[1], src->linesize[1],
                               AV_CEIL_RSHIFT(src->width, 1),
                               AV_CEIL_RSHIFT(src->height, 1),
                               (CUdeviceptr)out->data[1], out->linesize[1],
                               cx / 2, cy / 2, 2);
        if (ret < 0)
            break;
    }

    /* Single pop for both the success and the error path. */
    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
    if (ret < 0)
        goto fail;

    return ff_filter_frame(outlink, out);

fail:
    av_frame_free(&out);
    return ret;
}
/* ─── Lifecycle: init / uninit / config_input / config_output / activate ─ */
/*
 * Filter init callback. Nothing has to be prepared here: the four input pads
 * are declared statically in the filter definition (Phase 1 fixes the count
 * at 4), and all remaining state is set up in cuda_grid_config_output().
 *
 * Always returns 0.
 */
static av_cold int cuda_grid_init(AVFilterContext *ctx)
{
    return 0;
}
/*
 * Filter uninit callback: tear down the frame synchronizer and drop the
 * filter's hardware device reference.
 */
static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
{
    CudaGridContext *priv = ctx->priv;
    FFFrameSync     *sync = &priv->fs;

    ff_framesync_uninit(sync);
    av_buffer_unref(&priv->hw_device_ctx);
}
/*
 * Per-input link configuration: reject links that carry no hardware frames
 * context, because the filter only operates on hardware frames.
 *
 * Returns 0 on success, AVERROR(EINVAL) if the link has no hw_frames_ctx.
 */
static int cuda_grid_config_input(AVFilterLink *inlink)
{
    /* On an input link this filter is the destination. inlink->src is the
     * upstream filter and would attribute the log message to the wrong
     * filter instance. */
    AVFilterContext *ctx = inlink->dst;
    FilterLink      *inl = ff_filter_link(inlink);

    if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
        av_log(ctx, AV_LOG_ERROR, "input %d: software pixel format не поддерживается\n",
               FF_INLINK_IDX(inlink));
        return AVERROR(EINVAL);
    }

    return 0;
}
/*
 * Output link configuration.
 *
 * Validates that all inputs are NV12 hardware frames on the same device and
 * of identical size W x H, sets the output to 2W x 2H with a fixed 2x2 cell
 * layout, allocates the output hardware frames context on the inputs' device
 * and configures framesync over all inputs.
 *
 * Returns 0 on success or a negative AVERROR code.
 */
static int cuda_grid_config_output(AVFilterLink *outlink)
{
    AVFilterContext   *ctx  = outlink->src;
    CudaGridContext   *s    = ctx->priv;
    AVFilterLink      *in0  = ctx->inputs[0];
    FilterLink        *inl0 = ff_filter_link(in0);
    FilterLink        *outl = ff_filter_link(outlink);
    AVHWFramesContext *hwfc0;
    AVHWFramesContext *out_hwfc;
    AVHWDeviceContext *hwdev;
    AVBufferRef       *out_ref;
    int W, H, ret;

    if (!inl0->hw_frames_ctx)
        return AVERROR(EINVAL);
    hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data;

    if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
        av_log(ctx, AV_LOG_ERROR,
               "Phase 1 supports only NV12, got %s\n",
               av_get_pix_fmt_name(hwfc0->sw_format));
        return AVERROR(EINVAL);
    }

    /* Every other input must match input 0 in device, sw_format and size. */
    W = in0->w;
    H = in0->h;
    for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
        AVFilterLink      *inN = ctx->inputs[i];
        FilterLink        *ilN = ff_filter_link(inN);
        AVHWFramesContext *hN;

        if (!ilN->hw_frames_ctx)
            return AVERROR(EINVAL);
        hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data;

        if (hN->device_ctx != hwfc0->device_ctx) {
            av_log(ctx, AV_LOG_ERROR, "input %d device mismatch\n", i);
            return AVERROR(EINVAL);
        }
        if (hN->sw_format != hwfc0->sw_format) {
            av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
            return AVERROR(EINVAL);
        }
        if (inN->w != W || inN->h != H) {
            av_log(ctx, AV_LOG_ERROR,
                   "Phase 1: input %d size %dx%d != input 0 size %dx%d. "
                   "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n",
                   i, inN->w, inN->h, W, H);
            return AVERROR(EINVAL);
        }
    }

    /* Quad layout: output is 2W x 2H. */
    s->out_width  = 2 * W;
    s->out_height = 2 * H;
    outlink->w    = s->out_width;
    outlink->h    = s->out_height;

    /* Hardcoded cell rectangles: top-left, top-right, bottom-left,
     * bottom-right. */
    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        s->cells[i].x = (i & 1) ? W : 0;
        s->cells[i].y = (i & 2) ? H : 0;
        s->cells[i].w = W;
        s->cells[i].h = H;
    }

    /* Keep our own reference to the device so that the cached hwctx /
     * cu_ctx / cu_stream pointers stay valid for the filter's lifetime; it is
     * released in uninit. av_buffer_replace() also drops a reference left
     * over from an earlier configuration. */
    ret = av_buffer_replace(&s->hw_device_ctx, hwfc0->device_ref);
    if (ret < 0)
        return ret;

    hwdev        = hwfc0->device_ctx;
    s->hwctx     = (AVCUDADeviceContext *)hwdev->hwctx;
    s->cu_ctx    = s->hwctx->cuda_ctx;
    s->cu_stream = s->hwctx->stream;

    /* Output frames context: same device and format as the inputs, with the
     * enlarged output dimensions. */
    out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
    if (!out_ref)
        return AVERROR(ENOMEM);
    out_hwfc = (AVHWFramesContext *)out_ref->data;
    out_hwfc->format            = AV_PIX_FMT_CUDA;
    out_hwfc->sw_format         = AV_PIX_FMT_NV12;
    out_hwfc->width             = s->out_width;
    out_hwfc->height            = s->out_height;
    out_hwfc->initial_pool_size = 4;

    ret = av_hwframe_ctx_init(out_ref);
    if (ret < 0) {
        av_buffer_unref(&out_ref);
        return ret;
    }

    /* Do not leak a frames context from an earlier configuration. */
    av_buffer_unref(&outl->hw_frames_ctx);
    outl->hw_frames_ctx = out_ref;

    /* Framesync: pull all inputs in lock-step. */
    ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
    if (ret < 0)
        return ret;

    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        FFFrameSyncIn *fs_in = &s->fs.in[i];

        fs_in->time_base = ctx->inputs[i]->time_base;
        fs_in->sync      = 1;
        fs_in->before    = EXT_STOP;
        fs_in->after     = EXT_INFINITY;
    }
    s->fs.opaque   = s;
    s->fs.on_event = cuda_grid_compose;

    ret = ff_framesync_configure(&s->fs);
    if (ret < 0)
        return ret;

    /* Frames handed to the event callback are timestamped in the framesync
     * time base, which may differ from input 0's when the inputs use
     * different time bases. Advertise that time base on the output. */
    outlink->time_base = s->fs.time_base;

    return 0;
}
/*
 * Filter activate callback: all scheduling is delegated to the frame
 * synchronizer stored in the private context.
 */
static int cuda_grid_activate(AVFilterContext *ctx)
{
    FFFrameSync *sync = &((CudaGridContext *)ctx->priv)->fs;

    return ff_framesync_activate(sync);
}
/* ─── Filter registration ──────────────────────────────────────────────── */
/* Option table referenced by cuda_grid_class; contains only the terminator. */
static const AVOption cuda_grid_options[] = {
/* Phase 1: no options. Phase 2 will add `layout=`. */
{ NULL }
};
/* AVClass of the filter's private context; referenced as .priv_class in
 * ff_vf_cuda_grid and tied to the (currently empty) option table. */
static const AVClass cuda_grid_class = {
.class_name = "cuda_grid",
.item_name = av_default_item_name,
.option = cuda_grid_options,
.version = LIBAVUTIL_VERSION_INT,
.category = AV_CLASS_CATEGORY_FILTER,
};
/* The four statically declared video input pads. The pad order is the input
 * index used for cell assignment; every pad shares the same config callback. */
static const AVFilterPad cuda_grid_inputs[] = {
{ .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
{ .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
{ .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
{ .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
};
/* Single video output pad; its config callback sets up the output size, the
 * output frames context and framesync. */
static const AVFilterPad cuda_grid_outputs[] = {
{ .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output },
};
/* Filter definition exported as ff_vf_cuda_grid (declared in allfilters.c).
 * Wires the lifecycle callbacks, the four input pads and the single output
 * pad, and restricts the negotiated pixel format to AV_PIX_FMT_CUDA. */
const AVFilter ff_vf_cuda_grid = {
.name = "cuda_grid",
.description = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
.priv_class = &cuda_grid_class,
.priv_size = sizeof(CudaGridContext),
.init = cuda_grid_init,
.uninit = cuda_grid_uninit,
.activate = cuda_grid_activate,
FILTER_INPUTS(cuda_grid_inputs),
FILTER_OUTPUTS(cuda_grid_outputs),
FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
.flags = AVFILTER_FLAG_HWDEVICE,
.flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};