diff --git a/configure b/configure
index c24aa94..9c60cb7 100755
--- a/configure
+++ b/configure
@@ -3317,6 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+cuda_grid_filter_deps="ffnvcodec"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 91487af..8015a49 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -410,6 +410,7 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
                                                 cuda/load_helper.o
+OBJS-$(CONFIG_CUDA_GRID_FILTER)              += vf_cuda_grid.o framesync.o
 OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
                                                 opencl/overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_QSV_FILTER)            += vf_overlay_qsv.o framesync.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 9819f0f..bc0d00a 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv;
 extern const AVFilter ff_vf_overlay_vaapi;
 extern const AVFilter ff_vf_overlay_vulkan;
 extern const AVFilter ff_vf_overlay_cuda;
+extern const AVFilter ff_vf_cuda_grid;
 extern const AVFilter ff_vf_owdenoise;
 extern const AVFilter ff_vf_pad;
 extern const AVFilter ff_vf_pad_opencl;
diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
new file mode 100644
index 0000000..34003fe
--- /dev/null
+++ b/libavfilter/vf_cuda_grid.c
@@ -0,0 +1,415 @@
+/*
+ * cuda_grid - GPU-native video grid composer for FFmpeg 7.x.
+ *
+ * Takes N CUDA frames as input and emits one composed frame holding N cells
+ * arranged in a layout. End-to-end CUDA (no CPU round-trip).
+ *
+ * Phase 1 (MVP): fixed 2x2 quad layout, 4 NV12 inputs of identical size,
+ * output size = 2W x 2H, no scaling. Composition issues one cuMemcpy2DAsync
+ * per Y/UV plane of every input into the matching quadrant of the output.
+ *
+ * Future phases (see gx/vf-cuda-grid#1):
+ *   - Phase 2: dynamic layouts + per-cell scaling
+ *   - Phase 3: runtime layout switching via process_command (ZMQ)
+ *   - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
+ *
+ * License: LGPL-2.1+ (matches FFmpeg)
+ */
+
+#include "config_components.h"
+
+#include "libavutil/common.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/log.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "formats.h"
+#include "framesync.h"
+#include "video.h"
+
+/* Wraps a CUDA driver call in FF_CUDA_CHECK_DL (libavutil/cuda_check.h).
+ * Needs `ctx` (logging context) and `s` (CudaGridContext *) in scope. */
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+#define CUDA_GRID_INPUTS 4 /* Phase 1: fixed quad */
+
+typedef struct CudaGridContext {
+    const AVClass *class;
+
+    AVBufferRef         *hw_device_ctx; /* device reference held from config_output to uninit */
+    AVCUDADeviceContext *hwctx;
+    CUcontext            cu_ctx;
+    CUstream             cu_stream;
+
+    FFFrameSync fs;
+
+    /* Output dimensions (computed in config_output) */
+    int out_width;
+    int out_height;
+
+    /* Per-cell target rectangles inside the output frame.
+     * Phase 1 hardcodes four 2x2 cells: top-left, top-right, bottom-left,
+     * bottom-right. */
+    struct {
+        int x, y, w, h;
+    } cells[CUDA_GRID_INPUTS];
+} CudaGridContext;
+
+/**
+ * Queue an asynchronous device-to-device copy of one plane into a
+ * sub-rectangle of the output plane, on the filter's CUDA stream.
+ *
+ * @param ctx             filter context (logging + private state)
+ * @param src_data        device pointer to the source plane
+ * @param src_pitch       source pitch in bytes
+ * @param src_w           width of the copied region, in samples
+ * @param src_h           height of the copied region, in rows
+ * @param dst_data        device pointer to the destination plane
+ * @param dst_pitch       destination pitch in bytes
+ * @param dst_x           destination column offset, in samples
+ * @param dst_y           destination row offset
+ * @param bytes_per_pixel size of one sample in bytes
+ * @return result of CHECK_CU() on cuMemcpy2DAsync (negative on failure)
+ */
+static int copy_input_plane(AVFilterContext *ctx,
+                            CUdeviceptr src_data, int src_pitch,
+                            int src_w, int src_h,
+                            CUdeviceptr dst_data, int dst_pitch,
+                            int dst_x, int dst_y,
+                            int bytes_per_pixel)
+{
+    CudaGridContext *s = ctx->priv;
+    CUDA_MEMCPY2D cpy = {
+        .srcMemoryType = CU_MEMORYTYPE_DEVICE,
+        .srcDevice     = src_data,
+        .srcPitch      = src_pitch,
+        .dstMemoryType = CU_MEMORYTYPE_DEVICE,
+        .dstDevice     = dst_data,
+        .dstXInBytes   = (size_t)dst_x * bytes_per_pixel,
+        .dstY          = dst_y,
+        .dstPitch      = dst_pitch,
+        .WidthInBytes  = (size_t)src_w * bytes_per_pixel,
+        .Height        = src_h,
+    };
+
+    return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
+}
+
+/**
+ * Framesync event callback: fetch the current frame of every input and
+ * compose them into one output frame.
+ *
+ * @return 0 if an input has no frame yet, otherwise the ff_filter_frame()
+ *         result; negative AVERROR on failure
+ */
+static int cuda_grid_compose(FFFrameSync *fs)
+{
+    AVFilterContext *ctx     = fs->parent;
+    AVFilterLink    *outlink = ctx->outputs[0];
+    CudaGridContext *s       = ctx->priv;
+    AVFrame *in[CUDA_GRID_INPUTS] = { 0 };
+    AVFrame *out;
+    CUcontext dummy;
+    int ret;
+
+    /* Collect all N input frames (requested with get = 0, so not freed here) */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        ret = ff_framesync_get_frame(fs, i, &in[i], 0);
+        if (ret < 0)
+            return ret;
+        if (!in[i]) {
+            av_log(ctx, AV_LOG_WARNING, "input %d not ready, skipping frame\n", i);
+            return 0;
+        }
+    }
+
+    /* Validate sizes before allocating or touching the CUDA context, so the
+     * error path needs no cleanup. */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        if (in[i]->width != s->cells[i].w || in[i]->height != s->cells[i].h) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "input %d size %dx%d != expected cell size %dx%d "
+                   "(Phase 1: no scaling, all inputs must match cell size)\n",
+                   i, in[i]->width, in[i]->height,
+                   s->cells[i].w, s->cells[i].h);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    /* Output frame from the output link's hw frames pool */
+    out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
+    if (!out)
+        return AVERROR(ENOMEM);
+
+    /* Copy props (color metadata etc.) from the first input */
+    ret = av_frame_copy_props(out, in[0]);
+    if (ret < 0)
+        goto fail;
+    out->width  = s->out_width;
+    out->height = s->out_height;
+    /* All inputs have sync = 1 and after = EXT_INFINITY, so an event is not
+     * tied to a new frame on input 0 and input 0's pts could repeat. Use
+     * the framesync event timestamp instead. */
+    out->pts = av_rescale_q(fs->pts, fs->time_base, outlink->time_base);
+
+    /* Make the device context current for all copies of this call */
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+    if (ret < 0)
+        goto fail;
+
+    /* NV12: data[0] = Y, data[1] = interleaved UV; linesize[] is the pitch. */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        const AVFrame *src = in[i];
+        int cx = s->cells[i].x;
+        int cy = s->cells[i].y;
+
+        /* Y plane: full resolution, 1 byte per sample */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[0], src->linesize[0],
+                               src->width, src->height,
+                               (CUdeviceptr)out->data[0], out->linesize[0],
+                               cx, cy, 1);
+        if (ret < 0)
+            break;
+
+        /* UV plane: half resolution, 2 bytes per interleaved U/V pair.
+         * config_output rejects odd cell sizes, so halving is exact. */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[1], src->linesize[1],
+                               src->width / 2, src->height / 2,
+                               (CUdeviceptr)out->data[1], out->linesize[1],
+                               cx / 2, cy / 2, 2);
+        if (ret < 0)
+            break;
+    }
+
+    /* Single pop shared by the success path and the failed-copy path */
+    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto fail;
+
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&out);
+    return ret;
+}
+
+/* Nothing to set up: the four input pads are declared statically below. */
+static av_cold int cuda_grid_init(AVFilterContext *ctx)
+{
+    return 0;
+}
+
+/* Release framesync state and the device reference taken in config_output. */
+static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+
+    ff_framesync_uninit(&s->fs);
+    av_buffer_unref(&s->hw_device_ctx);
+}
+
+/* Per-input check: every input link must carry a hardware frames context. */
+static int cuda_grid_config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst; /* this filter is the link's destination */
+    FilterLink      *inl = ff_filter_link(inlink);
+
+    if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
+        av_log(ctx, AV_LOG_ERROR, "input %d: software pixel format не поддерживается\n",
+               FF_INLINK_IDX(inlink));
+        return AVERROR(EINVAL);
+    }
+    return 0;
+}
+
+/**
+ * Configure the output link: validate the inputs, compute the 2x2 layout,
+ * create the output hw frames context and set up framesync.
+ *
+ * @return 0 on success, negative AVERROR on failure
+ */
+static int cuda_grid_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext   *ctx  = outlink->src;
+    CudaGridContext   *s    = ctx->priv;
+    AVFilterLink      *in0  = ctx->inputs[0];
+    FilterLink        *inl0 = ff_filter_link(in0);
+    FilterLink        *outl = ff_filter_link(outlink);
+    AVHWFramesContext *hwfc0, *out_hwfc;
+    AVHWDeviceContext *hwdev;
+    AVBufferRef       *out_ref;
+    int W, H, ret;
+
+    if (!inl0->hw_frames_ctx)
+        return AVERROR(EINVAL);
+    hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data;
+
+    if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Phase 1 supports only NV12, got %s\n",
+               av_get_pix_fmt_name(hwfc0->sw_format));
+        return AVERROR(EINVAL);
+    }
+
+    W = in0->w;
+    H = in0->h;
+
+    /* NV12 chroma is subsampled 2x2 and cells start at x = W / y = H, so odd
+     * sizes would misalign chroma and drop the last chroma column/row. */
+    if ((W | H) & 1) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Phase 1: input size %dx%d must be even in both dimensions\n",
+               W, H);
+        return AVERROR(EINVAL);
+    }
+
+    /* All inputs must share device, sw_format and (Phase 1) size */
+    for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
+        AVFilterLink *inN = ctx->inputs[i];
+        FilterLink   *ilN = ff_filter_link(inN);
+        AVHWFramesContext *hN;
+
+        if (!ilN->hw_frames_ctx)
+            return AVERROR(EINVAL);
+        hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data;
+        if (hN->device_ctx != hwfc0->device_ctx) {
+            av_log(ctx, AV_LOG_ERROR, "input %d device mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+        if (hN->sw_format != hwfc0->sw_format) {
+            av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+        if (inN->w != W || inN->h != H) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Phase 1: input %d size %dx%d != input 0 size %dx%d. "
+                   "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n",
+                   i, inN->w, inN->h, W, H);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    /* Output = 2W x 2H for the quad layout */
+    s->out_width  = 2 * W;
+    s->out_height = 2 * H;
+    outlink->w    = s->out_width;
+    outlink->h    = s->out_height;
+
+    /* Hardcoded quad: cell i sits in column (i % 2), row (i / 2) */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        s->cells[i].x = (i % 2) * W;
+        s->cells[i].y = (i / 2) * H;
+        s->cells[i].w = W;
+        s->cells[i].h = H;
+    }
+
+    /* Hold our own device reference (dropped in uninit) so the cached
+     * hwctx / context / stream pointers are backed by a live reference. */
+    av_buffer_unref(&s->hw_device_ctx);
+    s->hw_device_ctx = av_buffer_ref(hwfc0->device_ref);
+    if (!s->hw_device_ctx)
+        return AVERROR(ENOMEM);
+    hwdev        = (AVHWDeviceContext *)s->hw_device_ctx->data;
+    s->hwctx     = (AVCUDADeviceContext *)hwdev->hwctx;
+    s->cu_ctx    = s->hwctx->cuda_ctx;
+    s->cu_stream = s->hwctx->stream;
+
+    /* Output hw frames context: same device, NV12, doubled dimensions */
+    out_ref = av_hwframe_ctx_alloc(s->hw_device_ctx);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_hwfc = (AVHWFramesContext *)out_ref->data;
+    out_hwfc->format            = AV_PIX_FMT_CUDA;
+    out_hwfc->sw_format         = AV_PIX_FMT_NV12;
+    out_hwfc->width             = s->out_width;
+    out_hwfc->height            = s->out_height;
+    out_hwfc->initial_pool_size = 4;
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0) {
+        av_buffer_unref(&out_ref);
+        return ret;
+    }
+    av_buffer_unref(&outl->hw_frames_ctx); /* drop any previously set context */
+    outl->hw_frames_ctx = out_ref;
+
+    /* Framesync: lock-step pull from all N inputs */
+    ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
+    if (ret < 0)
+        return ret;
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        FFFrameSyncIn *fs_in = &s->fs.in[i];
+
+        fs_in->time_base = ctx->inputs[i]->time_base;
+        fs_in->sync      = 1;
+        fs_in->before    = EXT_STOP;
+        fs_in->after     = EXT_INFINITY;
+    }
+    s->fs.opaque   = s;
+    s->fs.on_event = cuda_grid_compose;
+
+    ret = ff_framesync_configure(&s->fs);
+    if (ret < 0)
+        return ret;
+    /* Output timestamps are framesync event timestamps, so the output link
+     * uses the time base framesync settled on. */
+    outlink->time_base = s->fs.time_base;
+
+    return 0;
+}
+
+/* Delegate scheduling to framesync, which invokes cuda_grid_compose. */
+static int cuda_grid_activate(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+
+    return ff_framesync_activate(&s->fs);
+}
+
+static const AVOption cuda_grid_options[] = {
+    /* Phase 1: no options. Phase 2 will add `layout=`. */
+    { NULL }
+};
+
+static const AVClass cuda_grid_class = {
+    .class_name = "cuda_grid",
+    .item_name  = av_default_item_name,
+    .option     = cuda_grid_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_FILTER,
+};
+
+static const AVFilterPad cuda_grid_inputs[] = {
+    { .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+    { .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+    { .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+    { .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+};
+
+static const AVFilterPad cuda_grid_outputs[] = {
+    { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output },
+};
+
+const AVFilter ff_vf_cuda_grid = {
+    .name           = "cuda_grid",
+    .description    = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
+    .priv_class     = &cuda_grid_class,
+    .priv_size      = sizeof(CudaGridContext),
+    .init           = cuda_grid_init,
+    .uninit         = cuda_grid_uninit,
+    .activate       = cuda_grid_activate,
+    FILTER_INPUTS(cuda_grid_inputs),
+    FILTER_OUTPUTS(cuda_grid_outputs),
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+    .flags          = AVFILTER_FLAG_HWDEVICE,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};