vf_cuda_grid: Phase 1 MVP — fixed quad layout, 4 CUDA inputs → 1 output

Phase 1 deliverable (see gx/vf-cuda-grid#1): - libavfilter/vf_cuda_grid.c (~270 LOC): multi-input filter, fixed 2×2 quad - 4 NV12 CUDA frames of the same size → one 2W × 2H output frame - Composition: cuMemcpy2DAsync per Y + UV plane for each input - framesync for lock-step pull of all 4 inputs - Output hw_frames_ctx allocated from the input device_ref - Build wiring: CONFIG_CUDA_GRID_FILTER → libavfilter/{Makefile,allfilters.c}, configure deps on ffnvcodec Limitations of Phase 1: - All inputs must be the same size (no scaling) - Quad layout hardcoded (no DSL, no runtime switching) - NV12 only (no RGBA/YUV420P) Phase 2: dynamic layouts + scaling. Phase 3: runtime control via process_command.
2026-05-19 20:47:00 +01:00
parent 8ee2bd8ddb
commit 097ca81605
4 changed files with 362 additions and 0 deletions
@@ -3317,6 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+cuda_grid_filter_deps="ffnvcodec"
 sharpen_npp_filter_deps="ffnvcodec libnpp"

 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
@@ -410,6 +410,7 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
                                                cuda/load_helper.o
+OBJS-$(CONFIG_CUDA_GRID_FILTER)              += vf_cuda_grid.o framesync.o
 OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
                                                opencl/overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_QSV_FILTER)            += vf_overlay_qsv.o framesync.o
@@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv;
 extern const AVFilter ff_vf_overlay_vaapi;
 extern const AVFilter ff_vf_overlay_vulkan;
 extern const AVFilter ff_vf_overlay_cuda;
+extern const AVFilter ff_vf_cuda_grid;
 extern const AVFilter ff_vf_owdenoise;
 extern const AVFilter ff_vf_pad;
 extern const AVFilter ff_vf_pad_opencl;
@@ -0,0 +1,359 @@
+/*
+ * cuda_grid — GPU-native video grid composer for FFmpeg 7.x.
+ *
+ * Takes N CUDA frames as input and produces a single composed frame with
+ * N cells arranged in a layout. End-to-end CUDA (no CPU round-trip).
+ *
+ * Phase 1 (MVP): fixed 2×2 quad layout, 4 NV12 inputs of identical size,
+ * output size = 2W × 2H, no scaling. Composition uses cuMemcpy2DAsync per
+ * Y/UV plane, copying each input into its corresponding output quadrant.
+ *
+ * Future phases (see gx/vf-cuda-grid#1):
+ *  - Phase 2: dynamic layouts + per-cell scaling
+ *  - Phase 3: runtime layout switching via process_command (ZMQ)
+ *  - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
+ *
+ * License: LGPL-2.1+ (same as FFmpeg)
+ */
+
+#include "config_components.h"
+
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/log.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "formats.h"
+#include "framesync.h"
+#include "video.h"
+
+/* Wraps the CUDA driver call `x` in FF_CUDA_CHECK_DL. The expansion refers to
+ * two identifiers that must be in scope at every call site: `ctx` (passed as
+ * the first argument, presumably the logging context) and `s`, a
+ * CudaGridContext pointer whose hwctx has already been set. */
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+/* Number of input pads / grid cells handled by the filter. */
+#define CUDA_GRID_INPUTS 4  /* Phase 1: fixed quad */
+
+/* Private state of the cuda_grid filter (allocated as the filter's priv data). */
+typedef struct CudaGridContext {
+    const AVClass *class;
+
+    /* Unreferenced in uninit.
+     * NOTE(review): never assigned anywhere in this file — confirm it is needed. */
+    AVBufferRef *hw_device_ctx;
+    /* CUDA device context, CUDA context and stream; all three are taken from
+     * input 0's hardware frames context in config_output. */
+    AVCUDADeviceContext *hwctx;
+    CUcontext cu_ctx;
+    CUstream  cu_stream;
+
+    /* Frame synchronizer across all inputs; its on_event callback performs the composition. */
+    FFFrameSync fs;
+
+    /* Output dimensions (computed in config_output) */
+    int out_width;
+    int out_height;
+
+    /* Per-cell target rectangles in the output frame.
+     * Phase 1 hardcode: 4 cells in a 2×2 grid (top-left, top-right, bottom-left, bottom-right). */
+    struct {
+        int x, y, w, h;
+    } cells[CUDA_GRID_INPUTS];
+} CudaGridContext;
+
+/* ─── Composition: copy one input plane into its target region of the output ─ */
+
+/**
+ * Queue an asynchronous device-to-device 2D copy of one plane on the
+ * filter's CUDA stream.
+ *
+ * The whole source plane (starting at its origin) is copied; only the
+ * destination is offset.
+ *
+ * @param ctx             filter context (its priv data holds the CUDA handles)
+ * @param src_data        device pointer to the source plane
+ * @param src_pitch       source pitch in bytes
+ * @param src_w           width of the copied region, in pixels
+ * @param src_h           height of the copied region, in rows
+ * @param dst_data        device pointer to the destination plane
+ * @param dst_pitch       destination pitch in bytes
+ * @param dst_x           horizontal destination offset, in pixels
+ * @param dst_y           vertical destination offset, in rows
+ * @param bytes_per_pixel size of one pixel of this plane, in bytes
+ * @return the result of CHECK_CU applied to cuMemcpy2DAsync
+ */
+static int copy_input_plane(AVFilterContext *ctx,
+                            CUdeviceptr     src_data,
+                            int             src_pitch,
+                            int             src_w,
+                            int             src_h,
+                            CUdeviceptr     dst_data,
+                            int             dst_pitch,
+                            int             dst_x,
+                            int             dst_y,
+                            int             bytes_per_pixel)
+{
+    CudaGridContext *s = ctx->priv;
+    /* Pixel quantities are converted to bytes in size_t, as the driver expects. */
+    const size_t row_bytes    = (size_t)src_w * bytes_per_pixel;
+    const size_t dst_x_offset = (size_t)dst_x * bytes_per_pixel;
+    CUDA_MEMCPY2D desc = { 0 };
+    int err;
+
+    /* Source: device memory, read from the plane origin. */
+    desc.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    desc.srcDevice     = src_data;
+    desc.srcPitch      = src_pitch;
+
+    /* Destination: device memory, offset to the target cell. */
+    desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    desc.dstDevice     = dst_data;
+    desc.dstPitch      = dst_pitch;
+    desc.dstXInBytes   = dst_x_offset;
+    desc.dstY          = dst_y;
+
+    /* Extent of the copied rectangle. */
+    desc.WidthInBytes  = row_bytes;
+    desc.Height        = src_h;
+
+    err = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&desc, s->cu_stream));
+    return err;
+}
+
+/* ─── Framesync callback — N frames are ready, compose ────────────────── */
+
+/**
+ * Framesync event callback: compose the current frame of every input into
+ * one output frame and send it downstream.
+ *
+ * Each input's Y and UV planes are copied into the cell rectangle stored in
+ * CudaGridContext.cells[]. The input frames stay owned by framesync (they are
+ * fetched with get=0) and must not be freed here.
+ *
+ * @param fs frame synchronizer whose parent is this filter
+ * @return the result of ff_filter_frame() on success; 0 if an input has no
+ *         frame yet (nothing is output); a negative AVERROR code on failure
+ */
+static int cuda_grid_compose(FFFrameSync *fs)
+{
+    AVFilterContext *ctx     = fs->parent;
+    AVFilterLink    *outlink = ctx->outputs[0];
+    CudaGridContext *s       = ctx->priv;
+    AVFrame *out = NULL;
+    AVFrame *in[CUDA_GRID_INPUTS] = {0};
+    CUcontext dummy;
+    int ret;
+
+    /* Collect the current frame of every input from framesync */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        ret = ff_framesync_get_frame(fs, i, &in[i], 0);
+        if (ret < 0)
+            return ret;
+        if (!in[i]) {
+            av_log(ctx, AV_LOG_WARNING, "input %d not ready, skipping frame\n", i);
+            return 0;
+        }
+    }
+
+    /* Output frame from the output link's hardware frame pool */
+    out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
+    if (!out)
+        return AVERROR(ENOMEM);
+
+    /* Copy props (color metadata etc.) from the first input */
+    ret = av_frame_copy_props(out, in[0]);
+    if (ret < 0)
+        goto fail;
+    out->width  = s->out_width;
+    out->height = s->out_height;
+
+    /* The event may have been triggered by any input, so input 0's pts can be
+     * stale or repeated. Use the framesync event timestamp instead, rescaled
+     * from the framesync time base to the output link's time base. */
+    out->pts = av_rescale_q(fs->pts, fs->time_base, outlink->time_base);
+
+    /* Push the CUDA context for all copies issued by this call */
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+    if (ret < 0)
+        goto fail;
+
+    /* For every input, copy the Y plane and the UV plane into its quadrant.
+     * NV12 layout: data[0] = Y, data[1] = UV interleaved. linesize[0/1] = pitch. */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        AVFrame *src = in[i];
+        int cx = s->cells[i].x;
+        int cy = s->cells[i].y;
+        int cw = s->cells[i].w;
+        int ch = s->cells[i].h;
+
+        if (src->width != cw || src->height != ch) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "input %d size %dx%d != expected cell size %dx%d "
+                   "(Phase 1: no scaling, all inputs must match cell size)\n",
+                   i, src->width, src->height, cw, ch);
+            ret = AVERROR(EINVAL);
+            goto fail_pop;
+        }
+
+        /* Y plane (full resolution, 1 byte per pixel) */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[0], src->linesize[0],
+                               src->width, src->height,
+                               (CUdeviceptr)out->data[0], out->linesize[0],
+                               cx, cy, 1);
+        if (ret < 0)
+            goto fail_pop;
+
+        /* UV plane (half resolution for NV12, 2 bytes per interleaved UV sample).
+         * NOTE(review): the integer divisions assume even sizes and offsets. */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[1], src->linesize[1],
+                               src->width / 2, src->height / 2,
+                               (CUdeviceptr)out->data[1], out->linesize[1],
+                               cx / 2, cy / 2, 2);
+        if (ret < 0)
+            goto fail_pop;
+    }
+
+    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+
+    return ff_filter_frame(outlink, out);
+
+fail_pop:
+    /* Balance the earlier push; the pop result is deliberately ignored so the
+     * original error code in ret is what gets reported. */
+    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+fail:
+    av_frame_free(&out);
+    return ret;
+}
+
+/* ─── Lifecycle: init / uninit / config_input / config_output / activate ──── */
+
+/**
+ * Filter init callback. Nothing to set up in Phase 1: the four input pads are
+ * declared statically in the filter definition, so no pads are created here.
+ *
+ * @return always 0
+ */
+static av_cold int cuda_grid_init(AVFilterContext *ctx)
+{
+    return 0;
+}
+
+/**
+ * Filter uninit callback: tear down the frame synchronizer and drop the
+ * filter's reference to hw_device_ctx.
+ */
+static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
+{
+    CudaGridContext *grid = ctx->priv;
+
+    ff_framesync_uninit(&grid->fs);
+    av_buffer_unref(&grid->hw_device_ctx);
+}
+
+/**
+ * Input pad config callback: reject links that carry no hardware frames
+ * context (i.e. software frames).
+ *
+ * @param inlink input link being configured
+ * @return 0 on success, AVERROR(EINVAL) if the link has no hw_frames_ctx
+ */
+static int cuda_grid_config_input(AVFilterLink *inlink)
+{
+    /* For an input pad this filter is the link's destination; inlink->src is
+     * the upstream filter and would attribute the log line to the wrong one. */
+    AVFilterContext *ctx = inlink->dst;
+    FilterLink      *inl = ff_filter_link(inlink);
+
+    if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
+        av_log(ctx, AV_LOG_ERROR, "input %d: software pixel format не поддерживается\n",
+               FF_INLINK_IDX(inlink));
+        return AVERROR(EINVAL);
+    }
+    return 0;
+}
+
+/**
+ * Output pad config callback.
+ *
+ * Validates that all inputs are NV12 CUDA frames on the same device with
+ * identical, even dimensions; computes the 2W × 2H output size and the four
+ * cell rectangles; caches the CUDA handles from input 0; allocates the output
+ * hardware frames context; and sets up framesync over all inputs.
+ *
+ * @param outlink output link being configured
+ * @return 0 (or the result of ff_framesync_configure) on success,
+ *         a negative AVERROR code on failure
+ */
+static int cuda_grid_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext     *ctx  = outlink->src;
+    CudaGridContext     *s    = ctx->priv;
+    AVFilterLink        *in0  = ctx->inputs[0];
+    FilterLink          *inl0 = ff_filter_link(in0);
+    FilterLink          *outl = ff_filter_link(outlink);
+    AVHWFramesContext   *hwfc0;
+    AVHWFramesContext   *out_hwfc;
+    AVHWDeviceContext   *hwdev;
+    AVBufferRef         *out_ref;
+    int W, H, ret;
+
+    if (!inl0->hw_frames_ctx)
+        return AVERROR(EINVAL);
+    hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data;
+
+    if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Phase 1 supports only NV12, got %s\n",
+               av_get_pix_fmt_name(hwfc0->sw_format));
+        return AVERROR(EINVAL);
+    }
+
+    W = in0->w;
+    H = in0->h;
+
+    /* NV12 chroma is subsampled 2x in both directions. With an odd W or H the
+     * right/bottom cells would start on a half chroma sample and the last
+     * chroma column/row of the output would never be written. */
+    if (W % 2 || H % 2) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Phase 1 requires even input dimensions for NV12, got %dx%d\n",
+               W, H);
+        return AVERROR(EINVAL);
+    }
+    /* Guard the 2*W / 2*H computation below against signed overflow. */
+    if (W <= 0 || H <= 0 || W > INT_MAX / 2 || H > INT_MAX / 2) {
+        av_log(ctx, AV_LOG_ERROR, "invalid input size %dx%d\n", W, H);
+        return AVERROR(EINVAL);
+    }
+
+    /* All inputs must share the device and sw_format (Phase 1: also the size) */
+    for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
+        AVFilterLink *inN = ctx->inputs[i];
+        FilterLink   *ilN = ff_filter_link(inN);
+        AVHWFramesContext *hN;
+        if (!ilN->hw_frames_ctx)
+            return AVERROR(EINVAL);
+        hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data;
+        if (hN->device_ctx != hwfc0->device_ctx) {
+            av_log(ctx, AV_LOG_ERROR, "input %d device mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+        if (hN->sw_format != hwfc0->sw_format) {
+            av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+        if (inN->w != W || inN->h != H) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Phase 1: input %d size %dx%d != input 0 size %dx%d. "
+                   "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n",
+                   i, inN->w, inN->h, W, H);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    /* Output = 2W × 2H for the quad layout */
+    s->out_width  = 2 * W;
+    s->out_height = 2 * H;
+    outlink->w = s->out_width;
+    outlink->h = s->out_height;
+
+    /* Hardcoded quad cell positions: top-left, top-right, bottom-left, bottom-right */
+    s->cells[0].x = 0;     s->cells[0].y = 0;     s->cells[0].w = W; s->cells[0].h = H;
+    s->cells[1].x = W;     s->cells[1].y = 0;     s->cells[1].w = W; s->cells[1].h = H;
+    s->cells[2].x = 0;     s->cells[2].y = H;     s->cells[2].w = W; s->cells[2].h = H;
+    s->cells[3].x = W;     s->cells[3].y = H;     s->cells[3].w = W; s->cells[3].h = H;
+
+    /* Cache the CUDA device, context and stream from input 0 */
+    hwdev        = hwfc0->device_ctx;
+    s->hwctx     = (AVCUDADeviceContext *)hwdev->hwctx;
+    s->cu_ctx    = s->hwctx->cuda_ctx;
+    s->cu_stream = s->hwctx->stream;
+
+    /* Allocate the output hw_frames_ctx on input 0's device with the new size */
+    out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_hwfc = (AVHWFramesContext *)out_ref->data;
+    out_hwfc->format    = AV_PIX_FMT_CUDA;
+    out_hwfc->sw_format = AV_PIX_FMT_NV12;
+    out_hwfc->width     = s->out_width;
+    out_hwfc->height    = s->out_height;
+    out_hwfc->initial_pool_size = 4;
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0) {
+        av_buffer_unref(&out_ref);
+        return ret;
+    }
+    /* Drop any context left over from a previous configuration before taking
+     * ownership of the new one (no-op when the pointer is NULL). */
+    av_buffer_unref(&outl->hw_frames_ctx);
+    outl->hw_frames_ctx = out_ref;
+
+    /* Set up framesync for lock-step pull from all inputs */
+    ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
+    if (ret < 0)
+        return ret;
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        FFFrameSyncIn *fs_in = &s->fs.in[i];
+        fs_in->time_base = ctx->inputs[i]->time_base;
+        fs_in->sync   = 1;
+        fs_in->before = EXT_STOP;
+        fs_in->after  = EXT_INFINITY;
+    }
+    s->fs.opaque   = s;
+    s->fs.on_event = cuda_grid_compose;
+
+    outlink->time_base = ctx->inputs[0]->time_base;
+
+    return ff_framesync_configure(&s->fs);
+}
+
+/**
+ * Filter activate callback: all scheduling is delegated to framesync.
+ *
+ * @return the result of ff_framesync_activate()
+ */
+static int cuda_grid_activate(AVFilterContext *ctx)
+{
+    CudaGridContext *grid = ctx->priv;
+
+    return ff_framesync_activate(&grid->fs);
+}
+
+/* ─── Filter registration ──────────────────────────────────────────────── */
+
+/* Option table of the filter; contains only the terminating entry. */
+static const AVOption cuda_grid_options[] = {
+    /* Phase 1: no options. Phase 2 will add `layout=`. */
+    { NULL }
+};
+
+/* AVClass of the filter's private context; ties the name "cuda_grid" to the
+ * option table above and is referenced as priv_class by ff_vf_cuda_grid. */
+static const AVClass cuda_grid_class = {
+    .class_name = "cuda_grid",
+    .item_name  = av_default_item_name,
+    .option     = cuda_grid_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_FILTER,
+};
+
+/* The four fixed video input pads (one per grid cell); every pad is checked
+ * by cuda_grid_config_input when its link is configured. */
+static const AVFilterPad cuda_grid_inputs[] = {
+    {
+        .name         = "input0",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cuda_grid_config_input,
+    },
+    {
+        .name         = "input1",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cuda_grid_config_input,
+    },
+    {
+        .name         = "input2",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cuda_grid_config_input,
+    },
+    {
+        .name         = "input3",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cuda_grid_config_input,
+    },
+};
+
+/* Single video output pad; configured by cuda_grid_config_output. */
+static const AVFilterPad cuda_grid_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cuda_grid_config_output,
+    },
+};
+
+/* Filter definition for "cuda_grid": four statically declared video inputs,
+ * one video output, AV_PIX_FMT_CUDA as the only supported pixel format, and
+ * activate-based scheduling (driven by framesync). */
+const AVFilter ff_vf_cuda_grid = {
+    .name           = "cuda_grid",
+    .description    = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
+    .priv_class     = &cuda_grid_class,
+    .priv_size      = sizeof(CudaGridContext),
+    .init           = cuda_grid_init,
+    .uninit         = cuda_grid_uninit,
+    .activate       = cuda_grid_activate,
+    FILTER_INPUTS(cuda_grid_inputs),
+    FILTER_OUTPUTS(cuda_grid_outputs),
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+    .flags          = AVFILTER_FLAG_HWDEVICE,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};