From 097ca816057223e45dd66b08fdebd01d09b97816 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 20:47:00 +0100 Subject: [PATCH 01/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=201=20MVP=20?= =?UTF-8?q?=E2=80=94=20fixed=20quad=20layout,=204=20CUDA=20inputs=20?= =?UTF-8?q?=E2=86=92=201=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 deliverable (см. gx/vf-cuda-grid#1): - libavfilter/vf_cuda_grid.c (~270 LOC): multi-input filter, fixed 2×2 quad - 4 NV12 CUDA frames same size → 2W × 2H output frame - Composition: cuMemcpy2DAsync per Y + UV plane на каждый input - framesync для lock-step pull всех 4 inputs - Output hw_frames_ctx allocated from input device_ref - Build wiring: CONFIG_CUDA_GRID_FILTER → libavfilter/{Makefile,allfilters.c}, configure deps на ffnvcodec Limitations Phase 1: - All inputs must be same size (no scaling) - Quad layout hardcoded (no DSL, no runtime switching) - NV12 only (no RGBA/YUV420P) Phase 2: dynamic layouts + scaling. Phase 3: runtime control via process_command. --- configure | 1 + libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/vf_cuda_grid.c | 359 +++++++++++++++++++++++++++++++++++++ 4 files changed, 362 insertions(+) create mode 100644 libavfilter/vf_cuda_grid.c diff --git a/configure b/configure index c24aa94..9c60cb7 100755 --- a/configure +++ b/configure @@ -3317,6 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +cuda_grid_filter_deps="ffnvcodec" sharpen_npp_filter_deps="ffnvcodec libnpp" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 91487af..8015a49 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -410,6 +410,7 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o OBJS-$(CONFIG_OVERLAY_FILTER) += vf_overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_CUDA_FILTER) += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \ cuda/load_helper.o +OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \ opencl/overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 9819f0f..bc0d00a 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv; extern const AVFilter ff_vf_overlay_vaapi; extern const AVFilter ff_vf_overlay_vulkan; extern const AVFilter ff_vf_overlay_cuda; +extern const AVFilter ff_vf_cuda_grid; extern const AVFilter ff_vf_owdenoise; extern const AVFilter ff_vf_pad; extern const AVFilter ff_vf_pad_opencl; diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c new file mode 100644 index 0000000..34003fe --- /dev/null +++ b/libavfilter/vf_cuda_grid.c @@ -0,0 +1,359 @@ +/* + * cuda_grid — GPU-native video grid composer для FFmpeg 7.x. + * + * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells + * в layout. End-to-end CUDA (без CPU round-trip). + * + * Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера, + * output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per + * Y/UV plane на каждый input → soответствующую quadrant'у output. + * + * Future phases (см. gx/vf-cuda-grid#1): + * - Phase 2: dynamic layouts + per-cell scaling + * - Phase 3: runtime layout switching через process_command (ZMQ) + * - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat) + * + * Лицензия: LGPL-2.1+ (соответствует FFmpeg) + */ + +#include "config_components.h" + +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/log.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" + +#include "avfilter.h" +#include "filters.h" +#include "formats.h" +#include "framesync.h" +#include "video.h" + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +#define CUDA_GRID_INPUTS 4 /* Phase 1: fixed quad */ + +typedef struct CudaGridContext { + const AVClass *class; + + AVBufferRef *hw_device_ctx; + AVCUDADeviceContext *hwctx; + CUcontext cu_ctx; + CUstream cu_stream; + + FFFrameSync fs; + + /* Output dimensions (computed in config_output) */ + int out_width; + int out_height; + + /* Per-cell target rectangles в output frame. + * Phase 1 hardcode: 4 ячейки 2×2 (top-left, top-right, bottom-left, bottom-right). */ + struct { + int x, y, w, h; + } cells[CUDA_GRID_INPUTS]; +} CudaGridContext; + +/* ─── Composition: copy одного input plane в target region output ──────── */ + +static int copy_input_plane(AVFilterContext *ctx, + CUdeviceptr src_data, + int src_pitch, + int src_w, + int src_h, + CUdeviceptr dst_data, + int dst_pitch, + int dst_x, + int dst_y, + int bytes_per_pixel) +{ + CudaGridContext *s = ctx->priv; + CUDA_MEMCPY2D cpy = { + .srcMemoryType = CU_MEMORYTYPE_DEVICE, + .srcDevice = src_data, + .srcPitch = src_pitch, + .dstMemoryType = CU_MEMORYTYPE_DEVICE, + .dstDevice = dst_data, + .dstXInBytes = (size_t)dst_x * bytes_per_pixel, + .dstY = dst_y, + .dstPitch = dst_pitch, + .WidthInBytes = (size_t)src_w * bytes_per_pixel, + .Height = src_h, + }; + return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream)); +} + +/* ─── Framesync callback — N frames аre ready, compose ────────────────── */ + +static int cuda_grid_compose(FFFrameSync *fs) +{ + AVFilterContext *ctx = fs->parent; + AVFilterLink *outlink = ctx->outputs[0]; + CudaGridContext *s = ctx->priv; + AVFrame *out = NULL; + AVFrame *in[CUDA_GRID_INPUTS] = {0}; + CUcontext dummy; + int ret; + + /* Сбор всех N input frames из framesync */ + for (int i = 0; i < CUDA_GRID_INPUTS; i++) { + ret = ff_framesync_get_frame(fs, i, &in[i], 0); + if (ret < 0) + return ret; + if (!in[i]) { + av_log(ctx, AV_LOG_WARNING, "input %d not ready, skipping frame\n", i); + return 0; + } + } + + /* Output frame из output's hw_frames_pool */ + out = ff_get_video_buffer(outlink, s->out_width, s->out_height); + if (!out) + return AVERROR(ENOMEM); + + /* Copy props (timestamps, color metadata) от первого input */ + ret = av_frame_copy_props(out, in[0]); + if (ret < 0) + goto fail; + out->width = s->out_width; + out->height = s->out_height; + + /* CUDA context push для всех cuMemcpy в этом filter call */ + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + if (ret < 0) + goto fail; + + /* Для каждого input — copy Y plane + UV plane в свою quadrant. + * NV12 layout: data[0] = Y, data[1] = UV interleaved. linesize[0/1] = pitch. */ + for (int i = 0; i < CUDA_GRID_INPUTS; i++) { + AVFrame *src = in[i]; + int cx = s->cells[i].x; + int cy = s->cells[i].y; + int cw = s->cells[i].w; + int ch = s->cells[i].h; + + if (src->width != cw || src->height != ch) { + av_log(ctx, AV_LOG_ERROR, + "input %d size %dx%d != expected cell size %dx%d " + "(Phase 1: no scaling, all inputs must match cell size)\n", + i, src->width, src->height, cw, ch); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR(EINVAL); + goto fail; + } + + /* Y plane (full resolution, 1 byte per pixel) */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[0], src->linesize[0], + src->width, src->height, + (CUdeviceptr)out->data[0], out->linesize[0], + cx, cy, 1); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + + /* UV plane (half resolution для NV12, но 2 bytes per "pixel" — interleaved UV) */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[1], src->linesize[1], + src->width / 2, src->height / 2, + (CUdeviceptr)out->data[1], out->linesize[1], + cx / 2, cy / 2, 2); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + } + + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + + return ff_filter_frame(outlink, out); + +fail: + av_frame_free(&out); + return ret; +} + +/* ─── Lifecycle: init / uninit / query_formats / config_input / config_output ─ */ + +static av_cold int cuda_grid_init(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + /* Сами inputs регистрируем в filter struct (см. внизу — нельзя AVFILTER_INPUT_COUNT_MAX + * без явных AVFilterPad'ов). Phase 1 fix=4. */ + (void)s; + return 0; +} + +static av_cold void cuda_grid_uninit(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + ff_framesync_uninit(&s->fs); + av_buffer_unref(&s->hw_device_ctx); +} + +static int cuda_grid_config_input(AVFilterLink *inlink) +{ + AVFilterContext *ctx = inlink->src; + FilterLink *inl = ff_filter_link(inlink); + + if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) { + av_log(ctx, AV_LOG_ERROR, "input %d: software pixel format не поддерживается\n", + FF_INLINK_IDX(inlink)); + return AVERROR(EINVAL); + } + return 0; +} + +static int cuda_grid_config_output(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + CudaGridContext *s = ctx->priv; + AVFilterLink *in0 = ctx->inputs[0]; + FilterLink *inl0 = ff_filter_link(in0); + FilterLink *outl = ff_filter_link(outlink); + AVHWFramesContext *hwfc0; + int W, H, ret; + + if (!inl0->hw_frames_ctx) + return AVERROR(EINVAL); + hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data; + + if (hwfc0->sw_format != AV_PIX_FMT_NV12) { + av_log(ctx, AV_LOG_ERROR, + "Phase 1 supports only NV12, got %s\n", + av_get_pix_fmt_name(hwfc0->sw_format)); + return AVERROR(EINVAL); + } + + /* Все inputs должны иметь одинаковый device и sw_format (Phase 1 also same size) */ + W = in0->w; + H = in0->h; + for (int i = 1; i < CUDA_GRID_INPUTS; i++) { + AVFilterLink *inN = ctx->inputs[i]; + FilterLink *ilN = ff_filter_link(inN); + AVHWFramesContext *hN; + if (!ilN->hw_frames_ctx) + return AVERROR(EINVAL); + hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data; + if (hN->device_ctx != hwfc0->device_ctx) { + av_log(ctx, AV_LOG_ERROR, "input %d device mismatch\n", i); + return AVERROR(EINVAL); + } + if (hN->sw_format != hwfc0->sw_format) { + av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i); + return AVERROR(EINVAL); + } + if (inN->w != W || inN->h != H) { + av_log(ctx, AV_LOG_ERROR, + "Phase 1: input %d size %dx%d != input 0 size %dx%d. " + "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n", + i, inN->w, inN->h, W, H); + return AVERROR(EINVAL); + } + } + + /* Output = 2W × 2H для quad layout */ + s->out_width = 2 * W; + s->out_height = 2 * H; + outlink->w = s->out_width; + outlink->h = s->out_height; + + /* Hardcoded quad cell positions */ + s->cells[0].x = 0; s->cells[0].y = 0; s->cells[0].w = W; s->cells[0].h = H; + s->cells[1].x = W; s->cells[1].y = 0; s->cells[1].w = W; s->cells[1].h = H; + s->cells[2].x = 0; s->cells[2].y = H; s->cells[2].w = W; s->cells[2].h = H; + s->cells[3].x = W; s->cells[3].y = H; s->cells[3].w = W; s->cells[3].h = H; + + /* Setup CUDA device + stream context из input 0 */ + AVHWDeviceContext *hwdev = hwfc0->device_ctx; + s->hwctx = (AVCUDADeviceContext *)hwdev->hwctx; + s->cu_ctx = s->hwctx->cuda_ctx; + s->cu_stream = s->hwctx->stream; + + /* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */ + AVBufferRef *out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref); + if (!out_ref) + return AVERROR(ENOMEM); + AVHWFramesContext *out_hwfc = (AVHWFramesContext *)out_ref->data; + out_hwfc->format = AV_PIX_FMT_CUDA; + out_hwfc->sw_format = AV_PIX_FMT_NV12; + out_hwfc->width = s->out_width; + out_hwfc->height = s->out_height; + out_hwfc->initial_pool_size = 4; + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) { + av_buffer_unref(&out_ref); + return ret; + } + outl->hw_frames_ctx = out_ref; + + /* Setup framesync для lock-step pull от N inputs */ + ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS); + if (ret < 0) + return ret; + for (int i = 0; i < CUDA_GRID_INPUTS; i++) { + FFFrameSyncIn *fs_in = &s->fs.in[i]; + fs_in->time_base = ctx->inputs[i]->time_base; + fs_in->sync = 1; + fs_in->before = EXT_STOP; + fs_in->after = EXT_INFINITY; + } + s->fs.opaque = s; + s->fs.on_event = cuda_grid_compose; + + outlink->time_base = ctx->inputs[0]->time_base; + + return ff_framesync_configure(&s->fs); +} + +static int cuda_grid_activate(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + return ff_framesync_activate(&s->fs); +} + +/* ─── Filter registration ──────────────────────────────────────────────── */ + +static const AVOption cuda_grid_options[] = { + /* Phase 1: no options. Phase 2 добавит `layout=`. */ + { NULL } +}; + +static const AVClass cuda_grid_class = { + .class_name = "cuda_grid", + .item_name = av_default_item_name, + .option = cuda_grid_options, + .version = LIBAVUTIL_VERSION_INT, + .category = AV_CLASS_CATEGORY_FILTER, +}; + +static const AVFilterPad cuda_grid_inputs[] = { + { .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, + { .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, + { .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, + { .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, +}; + +static const AVFilterPad cuda_grid_outputs[] = { + { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output }, +}; + +const AVFilter ff_vf_cuda_grid = { + .name = "cuda_grid", + .description = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."), + .priv_class = &cuda_grid_class, + .priv_size = sizeof(CudaGridContext), + .init = cuda_grid_init, + .uninit = cuda_grid_uninit, + .activate = cuda_grid_activate, + FILTER_INPUTS(cuda_grid_inputs), + FILTER_OUTPUTS(cuda_grid_outputs), + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + .flags = AVFILTER_FLAG_HWDEVICE, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; -- 2.52.0 From 4313c3f30d3adac36d73d239ff85069a30632e2b Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 20:50:17 +0100 Subject: [PATCH 02/14] vf_cuda_grid: fix #include cuda_check.h + mixed decl warnings (-Werror) --- libavfilter/vf_cuda_grid.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 34003fe..60a5ccd 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -19,6 +19,7 @@ #include "config_components.h" #include "libavutil/common.h" +#include "libavutil/cuda_check.h" #include "libavutil/hwcontext.h" #include "libavutil/hwcontext_cuda_internal.h" #include "libavutil/log.h" @@ -201,8 +202,8 @@ static int cuda_grid_config_input(AVFilterLink *inlink) FilterLink *inl = ff_filter_link(inlink); if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) { - av_log(ctx, AV_LOG_ERROR, "input %d: software pixel format не поддерживается\n", - FF_INLINK_IDX(inlink)); + av_log(ctx, AV_LOG_ERROR, "input %s: software pixel format не поддерживается\n", + inlink->dstpad->name); return AVERROR(EINVAL); } return 0; @@ -216,6 +217,9 @@ static int cuda_grid_config_output(AVFilterLink *outlink) FilterLink *inl0 = ff_filter_link(in0); FilterLink *outl = ff_filter_link(outlink); AVHWFramesContext *hwfc0; + AVHWDeviceContext *hwdev; + AVBufferRef *out_ref; + AVHWFramesContext *out_hwfc; int W, H, ret; if (!inl0->hw_frames_ctx) @@ -269,16 +273,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink) s->cells[3].x = W; s->cells[3].y = H; s->cells[3].w = W; s->cells[3].h = H; /* Setup CUDA device + stream context из input 0 */ - AVHWDeviceContext *hwdev = hwfc0->device_ctx; + hwdev = hwfc0->device_ctx; s->hwctx = (AVCUDADeviceContext *)hwdev->hwctx; s->cu_ctx = s->hwctx->cuda_ctx; s->cu_stream = s->hwctx->stream; /* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */ - AVBufferRef *out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref); + out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref); if (!out_ref) return AVERROR(ENOMEM); - AVHWFramesContext *out_hwfc = (AVHWFramesContext *)out_ref->data; + out_hwfc = (AVHWFramesContext *)out_ref->data; out_hwfc->format = AV_PIX_FMT_CUDA; out_hwfc->sw_format = AV_PIX_FMT_NV12; out_hwfc->width = s->out_width; @@ -296,12 +300,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink) ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS); if (ret < 0) return ret; - for (int i = 0; i < CUDA_GRID_INPUTS; i++) { - FFFrameSyncIn *fs_in = &s->fs.in[i]; - fs_in->time_base = ctx->inputs[i]->time_base; - fs_in->sync = 1; - fs_in->before = EXT_STOP; - fs_in->after = EXT_INFINITY; + { + int i; + FFFrameSyncIn *fs_in; + for (i = 0; i < CUDA_GRID_INPUTS; i++) { + fs_in = &s->fs.in[i]; + fs_in->time_base = ctx->inputs[i]->time_base; + fs_in->sync = 1; + fs_in->before = EXT_STOP; + fs_in->after = EXT_INFINITY; + } } s->fs.opaque = s; s->fs.on_event = cuda_grid_compose; -- 2.52.0 From 6ee2f474c7624d1414acf9e94702ba3974817e18 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 20:57:08 +0100 Subject: [PATCH 03/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202a=20?= =?UTF-8?q?=E2=80=94=20layout=20templates=20+=20dynamic=20nb=5Finputs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Layout templates (9): - single, dual_horizontal, dual_vertical - quad (default), main_plus_preview (1 big + 3 small) - six_grid (3x2), nine_grid (3x3), sixteen_grid (4x4) - panoramic Cells определены в normalized координатах (0.0-1.0), переводятся в pixels в config_output (× out_w/out_h). Alignment до chroma boundary (NV12 ÷ 2). Filter options: - layout= (default quad) - out_w= (default 1920) - out_h= (default 1080) Dynamic inputs: - nb_inputs derived из layout (single=1, quad=4, nine_grid=9, sixteen_grid=16) - ff_append_inpad_free_name в init() для каждой cell - AVFILTER_FLAG_DYNAMIC_INPUTS на filter Phase 2a limitation: - Каждый input должен быть точно cell_px size (no scaling). - Phase 2b добавит NPP resize для mixed-size inputs. --- libavfilter/vf_cuda_grid.c | 254 +++++++++++++++++++++++++------------ 1 file changed, 176 insertions(+), 78 deletions(-) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 60a5ccd..9537b54 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -4,12 +4,13 @@ * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells * в layout. End-to-end CUDA (без CPU round-trip). * - * Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера, - * output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per - * Y/UV plane на каждый input → soответствующую quadrant'у output. + * Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/ + * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size + * через option (default 1920×1080). Cell rects = normalized × output size. + * **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP). * * Future phases (см. gx/vf-cuda-grid#1): - * - Phase 2: dynamic layouts + per-cell scaling + * - Phase 2b: per-cell scaling через libnpp (mixed-size inputs) * - Phase 3: runtime layout switching через process_command (ZMQ) * - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat) * @@ -34,28 +35,115 @@ #include "video.h" #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) +#define MAX_CELLS 16 -#define CUDA_GRID_INPUTS 4 /* Phase 1: fixed quad */ +/* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */ + +typedef struct LayoutCell { + float x, y, w, h; /* normalized fraction of output size */ +} LayoutCell; + +typedef struct LayoutTemplate { + const char *name; + int nb_cells; + LayoutCell cells[MAX_CELLS]; +} LayoutTemplate; + +/* Layouts copy-paste'нуты по структуре cctv-processor config/grids.json. */ +static const LayoutTemplate layouts[] = { + { + "single", 1, + { {0.0f, 0.0f, 1.0f, 1.0f} } + }, + { + "dual_horizontal", 2, + { {0.0f, 0.0f, 0.5f, 1.0f}, {0.5f, 0.0f, 0.5f, 1.0f} } + }, + { + "dual_vertical", 2, + { {0.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 0.5f, 1.0f, 0.5f} } + }, + { + "quad", 4, + { + {0.0f, 0.0f, 0.5f, 0.5f}, {0.5f, 0.0f, 0.5f, 0.5f}, + {0.0f, 0.5f, 0.5f, 0.5f}, {0.5f, 0.5f, 0.5f, 0.5f}, + } + }, + { + /* Main camera 2/3 width, 3 small cameras stacked справа сверху вниз */ + "main_plus_preview", 4, + { + {0.0f, 0.0f, 2.0f/3, 1.0f}, + {2.0f/3, 0.0f, 1.0f/3, 1.0f/3}, + {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, + {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, + } + }, + { + "six_grid", 6, + { + {0.0f, 0.0f, 1.0f/3, 0.5f}, {1.0f/3, 0.0f, 1.0f/3, 0.5f}, {2.0f/3, 0.0f, 1.0f/3, 0.5f}, + {0.0f, 0.5f, 1.0f/3, 0.5f}, {1.0f/3, 0.5f, 1.0f/3, 0.5f}, {2.0f/3, 0.5f, 1.0f/3, 0.5f}, + } + }, + { + "nine_grid", 9, + { + {0.0f, 0.0f, 1.0f/3, 1.0f/3}, {1.0f/3, 0.0f, 1.0f/3, 1.0f/3}, {2.0f/3, 0.0f, 1.0f/3, 1.0f/3}, + {0.0f, 1.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, + {0.0f, 2.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, + } + }, + { + "sixteen_grid", 16, + { + {0.00f, 0.00f, 0.25f, 0.25f}, {0.25f, 0.00f, 0.25f, 0.25f}, {0.50f, 0.00f, 0.25f, 0.25f}, {0.75f, 0.00f, 0.25f, 0.25f}, + {0.00f, 0.25f, 0.25f, 0.25f}, {0.25f, 0.25f, 0.25f, 0.25f}, {0.50f, 0.25f, 0.25f, 0.25f}, {0.75f, 0.25f, 0.25f, 0.25f}, + {0.00f, 0.50f, 0.25f, 0.25f}, {0.25f, 0.50f, 0.25f, 0.25f}, {0.50f, 0.50f, 0.25f, 0.25f}, {0.75f, 0.50f, 0.25f, 0.25f}, + {0.00f, 0.75f, 0.25f, 0.25f}, {0.25f, 0.75f, 0.25f, 0.25f}, {0.50f, 0.75f, 0.25f, 0.25f}, {0.75f, 0.75f, 0.25f, 0.25f}, + } + }, + { + /* Один widescreen panoramic — 1 cell full width */ + "panoramic", 1, + { {0.0f, 0.0f, 1.0f, 1.0f} } + }, +}; + +static const LayoutTemplate *find_layout(const char *name) +{ + for (size_t i = 0; i < FF_ARRAY_ELEMS(layouts); i++) { + if (!strcmp(layouts[i].name, name)) + return &layouts[i]; + } + return NULL; +} + +/* ─── Filter state ─────────────────────────────────────────────────────── */ typedef struct CudaGridContext { const AVClass *class; - AVBufferRef *hw_device_ctx; + /* Options */ + char *layout_name; + int out_width; + int out_height; + + /* Resolved layout (после init) */ + const LayoutTemplate *layout; + + /* CUDA */ AVCUDADeviceContext *hwctx; CUcontext cu_ctx; CUstream cu_stream; FFFrameSync fs; - /* Output dimensions (computed in config_output) */ - int out_width; - int out_height; - - /* Per-cell target rectangles в output frame. - * Phase 1 hardcode: 4 ячейки 2×2 (top-left, top-right, bottom-left, bottom-right). */ + /* Per-cell pixel rects (computed в config_output из normalized × out size) */ struct { int x, y, w, h; - } cells[CUDA_GRID_INPUTS]; + } cell_px[MAX_CELLS]; } CudaGridContext; /* ─── Composition: copy одного input plane в target region output ──────── */ @@ -87,7 +175,7 @@ static int copy_input_plane(AVFilterContext *ctx, return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream)); } -/* ─── Framesync callback — N frames аre ready, compose ────────────────── */ +/* ─── Framesync callback ──────────────────────────────────────────────── */ static int cuda_grid_compose(FFFrameSync *fs) { @@ -95,12 +183,12 @@ static int cuda_grid_compose(FFFrameSync *fs) AVFilterLink *outlink = ctx->outputs[0]; CudaGridContext *s = ctx->priv; AVFrame *out = NULL; - AVFrame *in[CUDA_GRID_INPUTS] = {0}; + AVFrame *in[MAX_CELLS] = {0}; CUcontext dummy; - int ret; + int i, ret; + int nb = s->layout->nb_cells; - /* Сбор всех N input frames из framesync */ - for (int i = 0; i < CUDA_GRID_INPUTS; i++) { + for (i = 0; i < nb; i++) { ret = ff_framesync_get_frame(fs, i, &in[i], 0); if (ret < 0) return ret; @@ -110,43 +198,38 @@ static int cuda_grid_compose(FFFrameSync *fs) } } - /* Output frame из output's hw_frames_pool */ out = ff_get_video_buffer(outlink, s->out_width, s->out_height); if (!out) return AVERROR(ENOMEM); - /* Copy props (timestamps, color metadata) от первого input */ ret = av_frame_copy_props(out, in[0]); if (ret < 0) goto fail; out->width = s->out_width; out->height = s->out_height; - /* CUDA context push для всех cuMemcpy в этом filter call */ ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); if (ret < 0) goto fail; - /* Для каждого input — copy Y plane + UV plane в свою quadrant. - * NV12 layout: data[0] = Y, data[1] = UV interleaved. linesize[0/1] = pitch. */ - for (int i = 0; i < CUDA_GRID_INPUTS; i++) { + for (i = 0; i < nb; i++) { AVFrame *src = in[i]; - int cx = s->cells[i].x; - int cy = s->cells[i].y; - int cw = s->cells[i].w; - int ch = s->cells[i].h; + int cx = s->cell_px[i].x; + int cy = s->cell_px[i].y; + int cw = s->cell_px[i].w; + int ch = s->cell_px[i].h; if (src->width != cw || src->height != ch) { av_log(ctx, AV_LOG_ERROR, - "input %d size %dx%d != expected cell size %dx%d " - "(Phase 1: no scaling, all inputs must match cell size)\n", + "input %d size %dx%d != cell size %dx%d " + "(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n", i, src->width, src->height, cw, ch); CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); ret = AVERROR(EINVAL); goto fail; } - /* Y plane (full resolution, 1 byte per pixel) */ + /* Y plane */ ret = copy_input_plane(ctx, (CUdeviceptr)src->data[0], src->linesize[0], src->width, src->height, @@ -157,7 +240,7 @@ static int cuda_grid_compose(FFFrameSync *fs) goto fail; } - /* UV plane (half resolution для NV12, но 2 bytes per "pixel" — interleaved UV) */ + /* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */ ret = copy_input_plane(ctx, (CUdeviceptr)src->data[1], src->linesize[1], src->width / 2, src->height / 2, @@ -170,7 +253,6 @@ static int cuda_grid_compose(FFFrameSync *fs) } CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - return ff_filter_frame(outlink, out); fail: @@ -178,14 +260,38 @@ fail: return ret; } -/* ─── Lifecycle: init / uninit / query_formats / config_input / config_output ─ */ +/* ─── Lifecycle ────────────────────────────────────────────────────────── */ static av_cold int cuda_grid_init(AVFilterContext *ctx) { CudaGridContext *s = ctx->priv; - /* Сами inputs регистрируем в filter struct (см. внизу — нельзя AVFILTER_INPUT_COUNT_MAX - * без явных AVFilterPad'ов). Phase 1 fix=4. */ - (void)s; + const LayoutTemplate *lt; + int i, ret; + + lt = find_layout(s->layout_name); + if (!lt) { + av_log(ctx, AV_LOG_ERROR, "unknown layout '%s'. Доступны: ", s->layout_name); + for (i = 0; i < (int)FF_ARRAY_ELEMS(layouts); i++) + av_log(ctx, AV_LOG_ERROR, "%s ", layouts[i].name); + av_log(ctx, AV_LOG_ERROR, "\n"); + return AVERROR(EINVAL); + } + s->layout = lt; + + /* Dynamic inputs — append pad per cell */ + for (i = 0; i < lt->nb_cells; i++) { + AVFilterPad pad = { 0 }; + pad.type = AVMEDIA_TYPE_VIDEO; + pad.name = av_asprintf("input%d", i); + if (!pad.name) + return AVERROR(ENOMEM); + ret = ff_append_inpad_free_name(ctx, &pad); + if (ret < 0) + return ret; + } + + av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n", + lt->name, lt->nb_cells, s->out_width, s->out_height); return 0; } @@ -193,7 +299,6 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx) { CudaGridContext *s = ctx->priv; ff_framesync_uninit(&s->fs); - av_buffer_unref(&s->hw_device_ctx); } static int cuda_grid_config_input(AVFilterLink *inlink) @@ -220,7 +325,7 @@ static int cuda_grid_config_output(AVFilterLink *outlink) AVHWDeviceContext *hwdev; AVBufferRef *out_ref; AVHWFramesContext *out_hwfc; - int W, H, ret; + int i, ret; if (!inl0->hw_frames_ctx) return AVERROR(EINVAL); @@ -228,15 +333,28 @@ static int cuda_grid_config_output(AVFilterLink *outlink) if (hwfc0->sw_format != AV_PIX_FMT_NV12) { av_log(ctx, AV_LOG_ERROR, - "Phase 1 supports only NV12, got %s\n", + "Phase 1-2a поддерживают только NV12, got %s\n", av_get_pix_fmt_name(hwfc0->sw_format)); return AVERROR(EINVAL); } - /* Все inputs должны иметь одинаковый device и sw_format (Phase 1 also same size) */ - W = in0->w; - H = in0->h; - for (int i = 1; i < CUDA_GRID_INPUTS; i++) { + /* Compute pixel rects из normalized layout × output size */ + for (i = 0; i < s->layout->nb_cells; i++) { + s->cell_px[i].x = (int)(s->layout->cells[i].x * s->out_width); + s->cell_px[i].y = (int)(s->layout->cells[i].y * s->out_height); + s->cell_px[i].w = (int)(s->layout->cells[i].w * s->out_width); + s->cell_px[i].h = (int)(s->layout->cells[i].h * s->out_height); + /* Align до chroma boundary (NV12 → 2x2) */ + s->cell_px[i].x &= ~1; + s->cell_px[i].y &= ~1; + s->cell_px[i].w &= ~1; + s->cell_px[i].h &= ~1; + av_log(ctx, AV_LOG_VERBOSE, " cell[%d] = %dx%d @ (%d,%d)\n", + i, s->cell_px[i].w, s->cell_px[i].h, s->cell_px[i].x, s->cell_px[i].y); + } + + /* Validate все inputs: device match + sw_format match */ + for (i = 1; i < s->layout->nb_cells; i++) { AVFilterLink *inN = ctx->inputs[i]; FilterLink *ilN = ff_filter_link(inN); AVHWFramesContext *hN; @@ -251,34 +369,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink) av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i); return AVERROR(EINVAL); } - if (inN->w != W || inN->h != H) { - av_log(ctx, AV_LOG_ERROR, - "Phase 1: input %d size %dx%d != input 0 size %dx%d. " - "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n", - i, inN->w, inN->h, W, H); - return AVERROR(EINVAL); - } } - /* Output = 2W × 2H для quad layout */ - s->out_width = 2 * W; - s->out_height = 2 * H; outlink->w = s->out_width; outlink->h = s->out_height; - /* Hardcoded quad cell positions */ - s->cells[0].x = 0; s->cells[0].y = 0; s->cells[0].w = W; s->cells[0].h = H; - s->cells[1].x = W; s->cells[1].y = 0; s->cells[1].w = W; s->cells[1].h = H; - s->cells[2].x = 0; s->cells[2].y = H; s->cells[2].w = W; s->cells[2].h = H; - s->cells[3].x = W; s->cells[3].y = H; s->cells[3].w = W; s->cells[3].h = H; - - /* Setup CUDA device + stream context из input 0 */ hwdev = hwfc0->device_ctx; s->hwctx = (AVCUDADeviceContext *)hwdev->hwctx; s->cu_ctx = s->hwctx->cuda_ctx; s->cu_stream = s->hwctx->stream; - /* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */ out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref); if (!out_ref) return AVERROR(ENOMEM); @@ -296,14 +396,12 @@ static int cuda_grid_config_output(AVFilterLink *outlink) } outl->hw_frames_ctx = out_ref; - /* Setup framesync для lock-step pull от N inputs */ - ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS); + ret = ff_framesync_init(&s->fs, ctx, s->layout->nb_cells); if (ret < 0) return ret; { - int i; FFFrameSyncIn *fs_in; - for (i = 0; i < CUDA_GRID_INPUTS; i++) { + for (i = 0; i < s->layout->nb_cells; i++) { fs_in = &s->fs.in[i]; fs_in->time_base = ctx->inputs[i]->time_base; fs_in->sync = 1; @@ -315,7 +413,6 @@ static int cuda_grid_config_output(AVFilterLink *outlink) s->fs.on_event = cuda_grid_compose; outlink->time_base = ctx->inputs[0]->time_base; - return ff_framesync_configure(&s->fs); } @@ -325,10 +422,18 @@ static int cuda_grid_activate(AVFilterContext *ctx) return ff_framesync_activate(&s->fs); } -/* ─── Filter registration ──────────────────────────────────────────────── */ +/* ─── Options + registration ───────────────────────────────────────────── */ + +#define OFFSET(x) offsetof(CudaGridContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) static const AVOption cuda_grid_options[] = { - /* Phase 1: no options. Phase 2 добавит `layout=`. */ + { "layout", "имя layout template", + OFFSET(layout_name), AV_OPT_TYPE_STRING, { .str = "quad" }, 0, 0, FLAGS }, + { "out_w", "ширина output frame в пикселях", + OFFSET(out_width), AV_OPT_TYPE_INT, { .i64 = 1920 }, 16, 16384, FLAGS }, + { "out_h", "высота output frame в пикселях", + OFFSET(out_height), AV_OPT_TYPE_INT, { .i64 = 1080 }, 16, 16384, FLAGS }, { NULL } }; @@ -340,13 +445,6 @@ static const AVClass cuda_grid_class = { .category = AV_CLASS_CATEGORY_FILTER, }; -static const AVFilterPad cuda_grid_inputs[] = { - { .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, - { .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, - { .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, - { .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input }, -}; - static const AVFilterPad cuda_grid_outputs[] = { { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output }, }; @@ -359,9 +457,9 @@ const AVFilter ff_vf_cuda_grid = { .init = cuda_grid_init, .uninit = cuda_grid_uninit, .activate = cuda_grid_activate, - FILTER_INPUTS(cuda_grid_inputs), + /* No FILTER_INPUTS — pads added dynamically в init() per layout. */ FILTER_OUTPUTS(cuda_grid_outputs), FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), - .flags = AVFILTER_FLAG_HWDEVICE, + .flags = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS, .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, }; -- 2.52.0 From df476472e2e311f06e81c0669f10fca7ceb97e29 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 20:58:29 +0100 Subject: [PATCH 04/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20fix=20include=20avstr?= =?UTF-8?q?ing.h=20=D0=B4=D0=BB=D1=8F=20av=5Fasprintf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libavfilter/vf_cuda_grid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 9537b54..bfc4a53 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -19,6 +19,7 @@ #include "config_components.h" +#include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/cuda_check.h" #include "libavutil/hwcontext.h" -- 2.52.0 From 11f310061a3ac0290b790c565d8d4cb86cabdbf5 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 21:20:04 +0100 Subject: [PATCH 05/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202b=20?= =?UTF-8?q?=E2=80=94=20NPP=20scaling=20=D0=B4=D0=BB=D1=8F=20mixed-size=20i?= =?UTF-8?q?nputs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add libnpp dependency в configure (cuda_grid_filter_deps) - #include , nppSetStream(s->cu_stream) перед resize batch - Smart copy-or-scale в compose path: - src.size == cell.size → cuMemcpy2DAsync (fast path, zero overhead) - else → nppiResizeSqrPixel_8u_C1R для Y + _C2R для NV12 UV interleaved - NPPI_INTER_LINEAR interpolation (bilinear — стандартный для video) - Destination pointer offset через explicit pointer arithmetic (NPP не имеет dst_offset параметра, нужно сместить pSrc указатель) Это unblock'ает mixed-size cameras (parking 1920x1080 + gate_lpr 2688x1520 в одном grid с main_plus_preview layout — big cell scaled до 1280x1080, small cells scaled до 640x360). Phase 2 complete (2a + 2b). Phase 3 будет controller sidecar. --- configure | 2 +- libavfilter/vf_cuda_grid.c | 96 +++++++++++++++++++++++++++----------- 2 files changed, 69 insertions(+), 29 deletions(-) diff --git a/configure b/configure index 9c60cb7..cc6a635 100755 --- a/configure +++ b/configure @@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" -cuda_grid_filter_deps="ffnvcodec" +cuda_grid_filter_deps="ffnvcodec libnpp" sharpen_npp_filter_deps="ffnvcodec libnpp" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index bfc4a53..cfad54b 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -19,6 +19,8 @@ #include "config_components.h" +#include + #include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/cuda_check.h" @@ -213,6 +215,9 @@ static int cuda_grid_compose(FFFrameSync *fs) if (ret < 0) goto fail; + /* NPP в этом thread'е работает в нашем CUDA stream */ + nppSetStream(s->cu_stream); + for (i = 0; i < nb; i++) { AVFrame *src = in[i]; int cx = s->cell_px[i].x; @@ -220,36 +225,71 @@ static int cuda_grid_compose(FFFrameSync *fs) int cw = s->cell_px[i].w; int ch = s->cell_px[i].h; - if (src->width != cw || src->height != ch) { - av_log(ctx, AV_LOG_ERROR, - "input %d size %dx%d != cell size %dx%d " - "(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n", - i, src->width, src->height, cw, ch); - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - ret = AVERROR(EINVAL); - goto fail; - } + if (src->width == cw && src->height == ch) { + /* Fast path: same size — memcpy без NPP overhead */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[0], src->linesize[0], + src->width, src->height, + (CUdeviceptr)out->data[0], out->linesize[0], + cx, cy, 1); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[1], src->linesize[1], + src->width / 2, src->height / 2, + (CUdeviceptr)out->data[1], out->linesize[1], + cx / 2, cy / 2, 2); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + } else { + /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords + * выровнены до 2 в config_output). */ + NppStatus npp_err; + double xfactor_y = (double)cw / src->width; + double yfactor_y = (double)ch / src->height; + double xfactor_uv = (double)(cw / 2) / (src->width / 2); + double yfactor_uv = (double)(ch / 2) / (src->height / 2); - /* Y plane */ - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[0], src->linesize[0], - src->width, src->height, - (CUdeviceptr)out->data[0], out->linesize[0], - cx, cy, 1); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; - } + uint8_t *dst_y_ptr = out->data[0] + (size_t)cy * out->linesize[0] + cx; + uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx; + /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */ - /* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */ - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[1], src->linesize[1], - src->width / 2, src->height / 2, - (CUdeviceptr)out->data[1], out->linesize[1], - cx / 2, cy / 2, 2); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; + /* Y plane — 1 channel (luma) */ + npp_err = nppiResizeSqrPixel_8u_C1R( + src->data[0], (NppiSize){src->width, src->height}, + src->linesize[0], (NppiRect){0, 0, src->width, src->height}, + dst_y_ptr, out->linesize[0], + (NppiRect){0, 0, cw, ch}, + xfactor_y, yfactor_y, 0.0, 0.0, + NPPI_INTER_LINEAR); + if (npp_err != NPP_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, + "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n", + i, src->width, src->height, cw, ch, npp_err); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR_EXTERNAL; + goto fail; + } + + /* UV plane — 2 channels interleaved (NV12 chroma) */ + npp_err = nppiResizeSqrPixel_8u_C2R( + src->data[1], (NppiSize){src->width / 2, src->height / 2}, + src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2}, + dst_uv_ptr, out->linesize[1], + (NppiRect){0, 0, cw / 2, ch / 2}, + xfactor_uv, yfactor_uv, 0.0, 0.0, + NPPI_INTER_LINEAR); + if (npp_err != NPP_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, + "input %d UV plane NPP resize failed: %d\n", i, npp_err); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR_EXTERNAL; + goto fail; + } } } -- 2.52.0 From 178fc5bb4eb53a925256ecba5041fff0744e5f3a Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 21:45:40 +0100 Subject: [PATCH 06/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202b=20?= =?UTF-8?q?=E2=80=94=20delegated=20scaling=20to=20upstream=20scale=5Fnpp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit После попытки in-filter NPP scaling обнаружено что nppiResize не имеет _C2R variant для NV12 UV interleaved (только C1R, C3R, C4R). Alternatives: - 2× nppiResize_8u_C1R с split/merge через intermediate buffers - custom CUDA kernel - treat UV pair как 16u (blending artifact на boundaries) Pragmatic decision: cuda_grid делает только composition (same-size memcpy), а scaling делегируется существующему scale_npp filter в filter chain: [0]scale_npp=1280:1080[s0]; [1]scale_npp=640:360[s1]; ... \ [s0][s1]...cuda_grid=layout=main_plus_preview Unix philosophy + leverages production-tested NPP code. Controller (Phase 3) auto-generates filter graph с scale_npp per input. Revert: - #include - libnpp dependency в configure (cuda_grid_filter_deps="ffnvcodec") - nppiResize* calls в compose path Add: - Error message с примером scale_npp chain pattern - Doc в file header c filter graph пример Phase 2 = full deliverable (2a + 2b). Дальше Phase 3 controller. --- configure | 2 +- libavfilter/vf_cuda_grid.c | 116 ++++++++++++++----------------------- 2 files changed, 46 insertions(+), 72 deletions(-) diff --git a/configure b/configure index cc6a635..9c60cb7 100755 --- a/configure +++ b/configure @@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" -cuda_grid_filter_deps="ffnvcodec libnpp" +cuda_grid_filter_deps="ffnvcodec" sharpen_npp_filter_deps="ffnvcodec libnpp" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index cfad54b..eca57f0 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -4,13 +4,25 @@ * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells * в layout. End-to-end CUDA (без CPU round-trip). * - * Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/ + * Phase 2: layout templates (single/dual_h/dual_v/quad/main_plus_preview/ * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size * через option (default 1920×1080). Cell rects = normalized × output size. - * **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP). + * + * **Scaling delegated to upstream `scale_npp`** filter (Unix philosophy + + * production-tested NPP code). NPP не имеет nppiResize_8u_C2R для NV12 UV + * interleaved, поэтому in-filter scaling = either two intermediate plane + * buffers либо custom CUDA kernel — оба больше work чем filter chain'ить: + * + * ffmpeg ... -filter_complex \ + * "[0]scale_npp=1280:1080[s0]; \ + * [1]scale_npp=640:360[s1]; \ + * [2]scale_npp=640:360[s2]; \ + * [3]scale_npp=640:360[s3]; \ + * [s0][s1][s2][s3]cuda_grid=layout=main_plus_preview[out]" + * + * Controller (Phase 3) auto-generates filter graph с scale_npp per input. * * Future phases (см. gx/vf-cuda-grid#1): - * - Phase 2b: per-cell scaling через libnpp (mixed-size inputs) * - Phase 3: runtime layout switching через process_command (ZMQ) * - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat) * @@ -19,8 +31,6 @@ #include "config_components.h" -#include - #include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/cuda_check.h" @@ -215,9 +225,6 @@ static int cuda_grid_compose(FFFrameSync *fs) if (ret < 0) goto fail; - /* NPP в этом thread'е работает в нашем CUDA stream */ - nppSetStream(s->cu_stream); - for (i = 0; i < nb; i++) { AVFrame *src = in[i]; int cx = s->cell_px[i].x; @@ -225,71 +232,38 @@ static int cuda_grid_compose(FFFrameSync *fs) int cw = s->cell_px[i].w; int ch = s->cell_px[i].h; - if (src->width == cw && src->height == ch) { - /* Fast path: same size — memcpy без NPP overhead */ - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[0], src->linesize[0], - src->width, src->height, - (CUdeviceptr)out->data[0], out->linesize[0], - cx, cy, 1); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; - } - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[1], src->linesize[1], - src->width / 2, src->height / 2, - (CUdeviceptr)out->data[1], out->linesize[1], - cx / 2, cy / 2, 2); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; - } - } else { - /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords - * выровнены до 2 в config_output). */ - NppStatus npp_err; - double xfactor_y = (double)cw / src->width; - double yfactor_y = (double)ch / src->height; - double xfactor_uv = (double)(cw / 2) / (src->width / 2); - double yfactor_uv = (double)(ch / 2) / (src->height / 2); + if (src->width != cw || src->height != ch) { + av_log(ctx, AV_LOG_ERROR, + "input %d size %dx%d != cell size %dx%d. " + "cuda_grid не делает scaling — используй upstream scale_npp:\n" + " [in%d]scale_npp=%d:%d[scaled%d]; [scaled%d]...cuda_grid=layout=%s\n", + i, src->width, src->height, cw, ch, + i, cw, ch, i, i, s->layout->name); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR(EINVAL); + goto fail; + } - uint8_t *dst_y_ptr = out->data[0] + (size_t)cy * out->linesize[0] + cx; - uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx; - /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */ + /* Y plane (1 byte per pixel) */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[0], src->linesize[0], + src->width, src->height, + (CUdeviceptr)out->data[0], out->linesize[0], + cx, cy, 1); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } - /* Y plane — 1 channel (luma) */ - npp_err = nppiResizeSqrPixel_8u_C1R( - src->data[0], (NppiSize){src->width, src->height}, - src->linesize[0], (NppiRect){0, 0, src->width, src->height}, - dst_y_ptr, out->linesize[0], - (NppiRect){0, 0, cw, ch}, - xfactor_y, yfactor_y, 0.0, 0.0, - NPPI_INTER_LINEAR); - if (npp_err != NPP_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, - "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n", - i, src->width, src->height, cw, ch, npp_err); - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - ret = AVERROR_EXTERNAL; - goto fail; - } - - /* UV plane — 2 channels interleaved (NV12 chroma) */ - npp_err = nppiResizeSqrPixel_8u_C2R( - src->data[1], (NppiSize){src->width / 2, src->height / 2}, - src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2}, - dst_uv_ptr, out->linesize[1], - (NppiRect){0, 0, cw / 2, ch / 2}, - xfactor_uv, yfactor_uv, 0.0, 0.0, - NPPI_INTER_LINEAR); - if (npp_err != NPP_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, - "input %d UV plane NPP resize failed: %d\n", i, npp_err); - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - ret = AVERROR_EXTERNAL; - goto fail; - } + /* UV plane (NV12: half resolution, 2 bytes per "pixel") */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[1], src->linesize[1], + src->width / 2, src->height / 2, + (CUdeviceptr)out->data[1], out->linesize[1], + cx / 2, cy / 2, 2); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; } } -- 2.52.0 From 9deaca769710c88d227316f615e241d73ec2f483 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 22:17:41 +0100 Subject: [PATCH 07/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-1=20?= =?UTF-8?q?=E2=80=94=20rect=20overlay=20primitives=20(solid=20fill,=20no?= =?UTF-8?q?=20alpha)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Добавляет inner overlay state с mutex + process_command handler. Rendering filled/border rects через cuMemsetD2D8Async/D2D16Async — без custom kernel'а (Phase 4b-2 = alpha blend, требует .cu). Commands: add_overlay rect cell=N x=.. y=.. w=.. h=.. r=.. g=.. b=.. thickness=.. opacity=.. remove_overlay clear_overlays text/icon/dim — типы определены, render заглушен до Phase 4b-2/3/4. Co-Authored-By: Claude Opus 4.7 --- libavfilter/vf_cuda_grid.c | 402 ++++++++++++++++++++++++++++++++++++- 1 file changed, 391 insertions(+), 11 deletions(-) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index eca57f0..9f5cc51 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -40,6 +40,7 @@ #include "libavutil/mem.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" +#include "libavutil/thread.h" #include "avfilter.h" #include "filters.h" @@ -48,7 +49,46 @@ #include "video.h" #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) -#define MAX_CELLS 16 +#define MAX_CELLS 16 +#define MAX_OVERLAYS 64 +#define OVERLAY_ID_MAX 32 +#define OVERLAY_TEXT_MAX 128 + +/* ─── Overlay primitives (Phase 4b-1: rect only, no alpha) ─────────────── */ + +typedef enum { + OV_TYPE_RECT = 0, + OV_TYPE_TEXT, /* Phase 4b-3 */ + OV_TYPE_ICON, /* Phase 4b-4 */ + OV_TYPE_DIM, /* Phase 4b-2 (alpha kernel) */ +} GridOverlayType; + +typedef struct GridOverlay { + char id[OVERLAY_ID_MAX]; + GridOverlayType type; + int cell; /* -1 = absolute on output, иначе cell-relative */ + float x, y, w, h; /* [0..1] normalized; relative к cell или output */ + int z_order; /* меньше → рисуется первым */ + uint8_t opacity; /* 0..255, реально применяется в Phase 4b-2 */ + int visible; + union { + struct { + uint8_t r, g, b; + int thickness; /* 0 = filled, >0 = border (border via 4 strips) */ + } rect; + struct { + char text[OVERLAY_TEXT_MAX]; + int font_size; + uint8_t r, g, b; + } text; + struct { + char icon_name[32]; + } icon; + struct { + uint8_t amount; /* 0..255, alpha for dim */ + } dim; + } u; +} GridOverlay; /* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */ @@ -157,6 +197,12 @@ typedef struct CudaGridContext { struct { int x, y, w, h; } cell_px[MAX_CELLS]; + + /* Overlay state (Phase 4b) — mutex-guarded, process_command/render thread-safe */ + GridOverlay overlays[MAX_OVERLAYS]; + int nb_overlays; + pthread_mutex_t overlay_lock; + int overlay_lock_inited; } CudaGridContext; /* ─── Composition: copy одного input plane в target region output ──────── */ @@ -188,6 +234,272 @@ static int copy_input_plane(AVFilterContext *ctx, return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream)); } +/* ─── Overlay parsing + rendering (Phase 4b-1: rect, solid fill, no alpha) ─ */ + +/* BT.709 limited-range RGB → YUV. Достаточно для HDTV (любое 1920×1080 output). */ +static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b, + uint8_t *Y, uint8_t *U, uint8_t *V) +{ + int y = (int)( 0.183f * r + 0.614f * g + 0.062f * b) + 16; + int u = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128; + int v = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128; + *Y = av_clip_uint8(y); + *U = av_clip_uint8(u); + *V = av_clip_uint8(v); +} + +/* Parse args в формате: " = = ..." + * Out *ov заполняется defaults + parsed values. Returns 0 / AVERROR. */ +static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov) +{ + char id[OVERLAY_ID_MAX], type_str[16]; + const char *p; + + if (!args) + return AVERROR(EINVAL); + + if (sscanf(args, "%31s %15s", id, type_str) != 2) { + av_log(ctx, AV_LOG_ERROR, "overlay args: expected ' ...', got: %s\n", args); + return AVERROR(EINVAL); + } + + memset(ov, 0, sizeof(*ov)); + av_strlcpy(ov->id, id, sizeof(ov->id)); + + if (!strcmp(type_str, "rect")) ov->type = OV_TYPE_RECT; + else if (!strcmp(type_str, "text")) ov->type = OV_TYPE_TEXT; + else if (!strcmp(type_str, "icon")) ov->type = OV_TYPE_ICON; + else if (!strcmp(type_str, "dim")) ov->type = OV_TYPE_DIM; + else { + av_log(ctx, AV_LOG_ERROR, "unknown overlay type: %s\n", type_str); + return AVERROR(EINVAL); + } + + /* Defaults */ + ov->cell = -1; + ov->opacity = 255; + ov->visible = 1; + if (ov->type == OV_TYPE_RECT) + ov->u.rect.thickness = 0; /* filled */ + if (ov->type == OV_TYPE_DIM) + ov->u.dim.amount = 128; + + /* Advance past и */ + p = strchr(args, ' '); + if (!p) return 0; + while (*p == ' ') p++; + p = strchr(p, ' '); + if (!p) return 0; + while (*p == ' ') p++; + + while (*p) { + char key[32], val[OVERLAY_TEXT_MAX]; + int n = 0; + if (sscanf(p, "%31[^= \t]=%127s%n", key, val, &n) < 2) + break; + p += n; + while (*p == ' ' || *p == '\t') p++; + + if (!strcmp(key, "cell")) ov->cell = atoi(val); + else if (!strcmp(key, "x")) ov->x = (float)atof(val); + else if (!strcmp(key, "y")) ov->y = (float)atof(val); + else if (!strcmp(key, "w")) ov->w = (float)atof(val); + else if (!strcmp(key, "h")) ov->h = (float)atof(val); + else if (!strcmp(key, "z_order") || !strcmp(key, "z")) + ov->z_order = atoi(val); + else if (!strcmp(key, "opacity")) ov->opacity = av_clip(atoi(val), 0, 255); + else if (!strcmp(key, "visible")) ov->visible = atoi(val) ? 1 : 0; + else if (ov->type == OV_TYPE_RECT) { + if (!strcmp(key, "r")) ov->u.rect.r = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "g")) ov->u.rect.g = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "b")) ov->u.rect.b = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val); + } else if (ov->type == OV_TYPE_TEXT) { + if (!strcmp(key, "text")) av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text)); + else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val); + else if (!strcmp(key, "r")) ov->u.text.r = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "g")) ov->u.text.g = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "b")) ov->u.text.b = av_clip_uint8(atoi(val)); + } else if (ov->type == OV_TYPE_ICON) { + if (!strcmp(key, "icon_name")) av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name)); + } else if (ov->type == OV_TYPE_DIM) { + if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val)); + } + } + return 0; +} + +/* Lock must be held by caller. */ +static int overlay_upsert_locked(CudaGridContext *s, const GridOverlay *ov) +{ + int i; + for (i = 0; i < s->nb_overlays; i++) { + if (!strcmp(s->overlays[i].id, ov->id)) { + s->overlays[i] = *ov; + return 0; + } + } + if (s->nb_overlays >= MAX_OVERLAYS) + return AVERROR(ENOSPC); + s->overlays[s->nb_overlays++] = *ov; + return 0; +} + +static int overlay_remove_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_overlays; i++) { + if (!strcmp(s->overlays[i].id, id)) { + memmove(&s->overlays[i], &s->overlays[i + 1], + (s->nb_overlays - i - 1) * sizeof(GridOverlay)); + s->nb_overlays--; + return 0; + } + } + return AVERROR(ENOENT); +} + +/* Compute pixel rect для overlay: либо cell-relative, либо absolute. + * Clips против output bounds + 2px alignment для NV12 chroma. */ +static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov, + int *out_x, int *out_y, int *out_w, int *out_h) +{ + int rx, ry, rw, rh; + int base_x, base_y, base_w, base_h; + + if (ov->cell < 0) { + base_x = 0; + base_y = 0; + base_w = s->out_width; + base_h = s->out_height; + } else if (ov->cell < s->layout->nb_cells) { + base_x = s->cell_px[ov->cell].x; + base_y = s->cell_px[ov->cell].y; + base_w = s->cell_px[ov->cell].w; + base_h = s->cell_px[ov->cell].h; + } else { + return AVERROR(EINVAL); + } + + rx = base_x + (int)(ov->x * base_w); + ry = base_y + (int)(ov->y * base_h); + rw = (int)(ov->w * base_w); + rh = (int)(ov->h * base_h); + + if (rx < 0) { rw += rx; rx = 0; } + if (ry < 0) { rh += ry; ry = 0; } + if (rx + rw > s->out_width) rw = s->out_width - rx; + if (ry + rh > s->out_height) rh = s->out_height - ry; + rx &= ~1; ry &= ~1; rw &= ~1; rh &= ~1; + + *out_x = rx; *out_y = ry; *out_w = rw; *out_h = rh; + return (rw > 0 && rh > 0) ? 0 : 1; /* 1 → empty, skip */ +} + +/* Solid filled strip (no alpha). cu ctx must be pushed. */ +static int render_strip_solid(AVFilterContext *ctx, AVFrame *out, + int x, int y, int w, int h, + uint8_t Y, uint8_t U, uint8_t V) +{ + CudaGridContext *s = ctx->priv; + CUdeviceptr dst_y, dst_uv; + unsigned short uv; + int ret; + + x &= ~1; y &= ~1; w &= ~1; h &= ~1; + if (w <= 0 || h <= 0) return 0; + + dst_y = (CUdeviceptr)(out->data[0] + (size_t)y * out->linesize[0] + x); + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async( + dst_y, out->linesize[0], Y, (size_t)w, (size_t)h, s->cu_stream)); + if (ret < 0) return ret; + + /* NV12 UV interleaved, half res. cuMemsetD2D16Async sets 16-bit element + * = UV pair. Little-endian: low byte=U, high byte=V. */ + uv = (unsigned short)U | ((unsigned short)V << 8); + dst_uv = (CUdeviceptr)(out->data[1] + (size_t)(y / 2) * out->linesize[1] + x); + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async( + dst_uv, out->linesize[1], uv, (size_t)(w / 2), (size_t)(h / 2), s->cu_stream)); + return ret; +} + +static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + int px, py, pw, ph; + uint8_t Y, U, V; + int ret; + + ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph); + if (ret != 0) return ret < 0 ? ret : 0; + + rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V); + + if (ov->u.rect.thickness <= 0) { + /* Filled */ + return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V); + } else { + int t = ov->u.rect.thickness; + t = FFMIN(t, FFMIN(pw / 2, ph / 2)); + if (t < 2) t = 2; + t &= ~1; + /* Top */ + ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V); + if (ret < 0) return ret; + /* Bottom */ + ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V); + if (ret < 0) return ret; + /* Left */ + ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V); + if (ret < 0) return ret; + /* Right */ + ret = render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V); + return ret; + } +} + +/* Render all visible overlays. cu ctx must be pushed by caller. */ +static int render_overlays(AVFilterContext *ctx, AVFrame *out) +{ + CudaGridContext *s = ctx->priv; + GridOverlay sorted[MAX_OVERLAYS]; + int i, j, n, ret; + + pthread_mutex_lock(&s->overlay_lock); + n = s->nb_overlays; + memcpy(sorted, s->overlays, (size_t)n * sizeof(GridOverlay)); + pthread_mutex_unlock(&s->overlay_lock); + + /* Insertion sort by z_order (stable). n ≤ 64 → fine. */ + for (i = 1; i < n; i++) { + GridOverlay tmp = sorted[i]; + for (j = i; j > 0 && sorted[j - 1].z_order > tmp.z_order; j--) + sorted[j] = sorted[j - 1]; + sorted[j] = tmp; + } + + for (i = 0; i < n; i++) { + const GridOverlay *ov = &sorted[i]; + if (!ov->visible) continue; + switch (ov->type) { + case OV_TYPE_RECT: + ret = render_overlay_rect(ctx, out, ov); + if (ret < 0) return ret; + break; + case OV_TYPE_DIM: + av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id); + break; + case OV_TYPE_TEXT: + av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id); + break; + case OV_TYPE_ICON: + av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id); + break; + } + } + return 0; +} + /* ─── Framesync callback ──────────────────────────────────────────────── */ static int cuda_grid_compose(FFFrameSync *fs) @@ -267,6 +579,13 @@ static int cuda_grid_compose(FFFrameSync *fs) } } + /* Overlay pass (Phase 4b) */ + ret = render_overlays(ctx, out); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); return ff_filter_frame(outlink, out); @@ -305,7 +624,15 @@ static av_cold int cuda_grid_init(AVFilterContext *ctx) return ret; } - av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n", + /* Overlay mutex */ + ret = pthread_mutex_init(&s->overlay_lock, NULL); + if (ret) { + av_log(ctx, AV_LOG_ERROR, "overlay_lock init failed: %d\n", ret); + return AVERROR(ENOMEM); + } + s->overlay_lock_inited = 1; + + av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d (overlays: rect)\n", lt->name, lt->nb_cells, s->out_width, s->out_height); return 0; } @@ -314,6 +641,58 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx) { CudaGridContext *s = ctx->priv; ff_framesync_uninit(&s->fs); + if (s->overlay_lock_inited) { + pthread_mutex_destroy(&s->overlay_lock); + s->overlay_lock_inited = 0; + } +} + +static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd, + const char *arg, char *res, int res_len, int flags) +{ + CudaGridContext *s = ctx->priv; + int ret; + + if (!strcmp(cmd, "add_overlay")) { + GridOverlay ov; + ret = parse_overlay_args(ctx, arg, &ov); + if (ret < 0) { + if (res) av_strlcpy(res, "err parse", res_len); + return ret; + } + pthread_mutex_lock(&s->overlay_lock); + ret = overlay_upsert_locked(s, &ov); + pthread_mutex_unlock(&s->overlay_lock); + if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s n=%d" : "err full id=%s", + ov.id, s->nb_overlays); + return ret; + } + + if (!strcmp(cmd, "remove_overlay")) { + char id[OVERLAY_ID_MAX]; + if (!arg || sscanf(arg, "%31s", id) != 1) { + if (res) av_strlcpy(res, "err parse", res_len); + return AVERROR(EINVAL); + } + pthread_mutex_lock(&s->overlay_lock); + ret = overlay_remove_locked(s, id); + pthread_mutex_unlock(&s->overlay_lock); + if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id); + return ret; + } + + if (!strcmp(cmd, "clear_overlays")) { + pthread_mutex_lock(&s->overlay_lock); + s->nb_overlays = 0; + pthread_mutex_unlock(&s->overlay_lock); + if (res) av_strlcpy(res, "ok", res_len); + return 0; + } + + /* Future: set_layout (Phase 3 на filter side — пока nb_inputs хардкодится в init) */ + + /* Fall back to standard option/command handling */ + return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags); } static int cuda_grid_config_input(AVFilterLink *inlink) @@ -465,16 +844,17 @@ static const AVFilterPad cuda_grid_outputs[] = { }; const AVFilter ff_vf_cuda_grid = { - .name = "cuda_grid", - .description = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."), - .priv_class = &cuda_grid_class, - .priv_size = sizeof(CudaGridContext), - .init = cuda_grid_init, - .uninit = cuda_grid_uninit, - .activate = cuda_grid_activate, + .name = "cuda_grid", + .description = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."), + .priv_class = &cuda_grid_class, + .priv_size = sizeof(CudaGridContext), + .init = cuda_grid_init, + .uninit = cuda_grid_uninit, + .activate = cuda_grid_activate, + .process_command = cuda_grid_process_command, /* No FILTER_INPUTS — pads added dynamically в init() per layout. */ FILTER_OUTPUTS(cuda_grid_outputs), FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), - .flags = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS, - .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, + .flags = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, }; -- 2.52.0 From 8ca590004b48ba11b5c80900374031d0bf625246 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 22:24:01 +0100 Subject: [PATCH 08/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-2=20?= =?UTF-8?q?=E2=80=94=20CUDA=20kernels=20(alpha-blended=20overlays)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Custom .cu kernels компилируются через ffbuild → .ptx, embedded в binary через bin2c pattern (как vf_scale_cuda). Loading через ff_cuda_load_module в config_output, kernel handles cached на life-of-filter. Kernels: Alpha_Fill_Y/UV — solid colour α-blend (rect, dim, border strips) Alpha_Blit_RGBA_Y/UV — blit RGBA atlas → NV12 (text/icon в 4b-3/4) Render side: render_strip_alpha — заменил render_strip_solid (alpha=255 ≡ solid) render_overlay_rect — opacity передаётся в kernel render_overlay_dim — Y=16/UV=neutral × dim.amount (затемнение region) Также: убран unused cuda_grid_config_input (-Werror), добавлены fields CUmodule + 4×CUfunction в Context; unload в uninit. cuMemsetD2D* НЕ доступны в FFmpeg's CudaFunctions wrapper, поэтому solid fill реализован через kernel — лишний overhead minimal (16×16 threads). Co-Authored-By: Claude Opus 4.7 --- libavfilter/Makefile | 3 +- libavfilter/vf_cuda_grid.c | 143 +++++++++++++++++++++++++++--------- libavfilter/vf_cuda_grid.cu | 121 ++++++++++++++++++++++++++++++ 3 files changed, 230 insertions(+), 37 deletions(-) create mode 100644 libavfilter/vf_cuda_grid.cu diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 8015a49..f570821 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -410,7 +410,8 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o OBJS-$(CONFIG_OVERLAY_FILTER) += vf_overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_CUDA_FILTER) += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \ cuda/load_helper.o -OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o +OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o \ + vf_cuda_grid.ptx.o cuda/load_helper.o OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \ opencl/overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 9f5cc51..675fb15 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -43,11 +43,19 @@ #include "libavutil/thread.h" #include "avfilter.h" +#include "cuda/load_helper.h" #include "filters.h" #include "formats.h" #include "framesync.h" #include "video.h" +extern const unsigned char ff_vf_cuda_grid_ptx_data[]; +extern const unsigned int ff_vf_cuda_grid_ptx_len; + +#define BLOCKX 16 +#define BLOCKY 16 +#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) + #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) #define MAX_CELLS 16 #define MAX_OVERLAYS 64 @@ -203,6 +211,13 @@ typedef struct CudaGridContext { int nb_overlays; pthread_mutex_t overlay_lock; int overlay_lock_inited; + + /* CUDA kernels (Phase 4b-2) — loaded from embedded .ptx */ + CUmodule cu_module; + CUfunction cu_func_alpha_fill_y; + CUfunction cu_func_alpha_fill_uv; + CUfunction cu_func_alpha_blit_rgba_y; + CUfunction cu_func_alpha_blit_rgba_uv; } CudaGridContext; /* ─── Composition: copy одного input plane в target region output ──────── */ @@ -396,31 +411,49 @@ static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov, return (rw > 0 && rh > 0) ? 0 : 1; /* 1 → empty, skip */ } -/* Solid filled strip (no alpha). cu ctx must be pushed. */ -static int render_strip_solid(AVFilterContext *ctx, AVFrame *out, +/* Strip α-blend (opacity 0..255). cu ctx must be pushed. + * opacity=255 → solid fill (kernel uses single-pass blend formula). */ +static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out, int x, int y, int w, int h, - uint8_t Y, uint8_t U, uint8_t V) + uint8_t Y, uint8_t U, uint8_t V, uint8_t alpha) { CudaGridContext *s = ctx->priv; CUdeviceptr dst_y, dst_uv; - unsigned short uv; int ret; + int iY = Y, iU = U, iV = V, iA = alpha; + int rx, ry, rw, rh; x &= ~1; y &= ~1; w &= ~1; h &= ~1; - if (w <= 0 || h <= 0) return 0; + if (w <= 0 || h <= 0 || alpha == 0) return 0; - dst_y = (CUdeviceptr)(out->data[0] + (size_t)y * out->linesize[0] + x); - ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async( - dst_y, out->linesize[0], Y, (size_t)w, (size_t)h, s->cu_stream)); - if (ret < 0) return ret; + /* Y plane */ + dst_y = (CUdeviceptr)out->data[0]; + rx = x; ry = y; rw = w; rh = h; + { + int dst_pitch_y = out->linesize[0]; + void *args[] = { &dst_y, &dst_pitch_y, &rx, &ry, &rw, &rh, &iY, &iA }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_fill_y, + DIV_UP(w, BLOCKX), DIV_UP(h, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } - /* NV12 UV interleaved, half res. cuMemsetD2D16Async sets 16-bit element - * = UV pair. Little-endian: low byte=U, high byte=V. */ - uv = (unsigned short)U | ((unsigned short)V << 8); - dst_uv = (CUdeviceptr)(out->data[1] + (size_t)(y / 2) * out->linesize[1] + x); - ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async( - dst_uv, out->linesize[1], uv, (size_t)(w / 2), (size_t)(h / 2), s->cu_stream)); - return ret; + /* UV plane (half res, interleaved 2 bytes per chroma sample) */ + dst_uv = (CUdeviceptr)out->data[1]; + rx = x / 2; ry = y / 2; rw = w / 2; rh = h / 2; + { + int dst_pitch_uv = out->linesize[1]; + void *args[] = { &dst_uv, &dst_pitch_uv, &rx, &ry, &rw, &rh, &iU, &iV, &iA }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_fill_uv, + DIV_UP(w / 2, BLOCKX), DIV_UP(h / 2, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + return 0; } static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) @@ -436,28 +469,42 @@ static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOve rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V); if (ov->u.rect.thickness <= 0) { - /* Filled */ - return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V); + /* Filled rect */ + return render_strip_alpha(ctx, out, px, py, pw, ph, Y, U, V, ov->opacity); } else { int t = ov->u.rect.thickness; t = FFMIN(t, FFMIN(pw / 2, ph / 2)); if (t < 2) t = 2; t &= ~1; /* Top */ - ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V); + ret = render_strip_alpha(ctx, out, px, py, pw, t, Y, U, V, ov->opacity); if (ret < 0) return ret; /* Bottom */ - ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V); + ret = render_strip_alpha(ctx, out, px, py + ph - t, pw, t, Y, U, V, ov->opacity); if (ret < 0) return ret; /* Left */ - ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V); + ret = render_strip_alpha(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V, ov->opacity); if (ret < 0) return ret; /* Right */ - ret = render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V); + ret = render_strip_alpha(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V, ov->opacity); return ret; } } +/* Dim overlay: рамка тёмная (Y=16, neutral UV) полупрозрачная по cell. */ +static int render_overlay_dim(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + int px, py, pw, ph; + int ret; + + ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph); + if (ret != 0) return ret < 0 ? ret : 0; + + /* Y=16 = darkest legal Y (BT.709 limited range), UV=128 = neutral chroma */ + return render_strip_alpha(ctx, out, px, py, pw, ph, 16, 128, 128, ov->u.dim.amount); +} + /* Render all visible overlays. cu ctx must be pushed by caller. */ static int render_overlays(AVFilterContext *ctx, AVFrame *out) { @@ -487,7 +534,8 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out) if (ret < 0) return ret; break; case OV_TYPE_DIM: - av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id); + ret = render_overlay_dim(ctx, out, ov); + if (ret < 0) return ret; break; case OV_TYPE_TEXT: av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id); @@ -641,6 +689,13 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx) { CudaGridContext *s = ctx->priv; ff_framesync_uninit(&s->fs); + if (s->cu_module && s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + s->cu_module = NULL; + } if (s->overlay_lock_inited) { pthread_mutex_destroy(&s->overlay_lock); s->overlay_lock_inited = 0; @@ -695,19 +750,6 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd, return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags); } -static int cuda_grid_config_input(AVFilterLink *inlink) -{ - AVFilterContext *ctx = inlink->src; - FilterLink *inl = ff_filter_link(inlink); - - if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) { - av_log(ctx, AV_LOG_ERROR, "input %s: software pixel format не поддерживается\n", - inlink->dstpad->name); - return AVERROR(EINVAL); - } - return 0; -} - static int cuda_grid_config_output(AVFilterLink *outlink) { AVFilterContext *ctx = outlink->src; @@ -773,6 +815,35 @@ static int cuda_grid_config_output(AVFilterLink *outlink) s->cu_ctx = s->hwctx->cuda_ctx; s->cu_stream = s->hwctx->stream; + /* Load CUDA module + resolve kernel handles (Phase 4b-2 alpha kernels) */ + { + CUcontext dummy; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + if (ret < 0) return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_cuda_grid_ptx_data, ff_vf_cuda_grid_ptx_len); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + return ret; + } + +#define GET_FN(handle, name) \ + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleGetFunction( \ + &s->handle, s->cu_module, name)); \ + if (ret < 0) { \ + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); \ + return ret; \ + } + GET_FN(cu_func_alpha_fill_y, "Alpha_Fill_Y"); + GET_FN(cu_func_alpha_fill_uv, "Alpha_Fill_UV"); + GET_FN(cu_func_alpha_blit_rgba_y, "Alpha_Blit_RGBA_Y"); + GET_FN(cu_func_alpha_blit_rgba_uv, "Alpha_Blit_RGBA_UV"); +#undef GET_FN + + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } + out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref); if (!out_ref) return AVERROR(ENOMEM); diff --git a/libavfilter/vf_cuda_grid.cu b/libavfilter/vf_cuda_grid.cu new file mode 100644 index 0000000..10d2af6 --- /dev/null +++ b/libavfilter/vf_cuda_grid.cu @@ -0,0 +1,121 @@ +/* + * cuda_grid overlay CUDA kernels. + * + * Алфа-блендинг poверх NV12 frame: + * - Alpha_Fill_Y / Alpha_Fill_UV: solid colour fill region (rect/dim) + * - Alpha_Blit_RGBA_Y / Alpha_Blit_RGBA_UV: blit RGBA atlas → NV12 + * (для text Phase 4b-3 и icon Phase 4b-4) + * + * BT.709 limited-range conversion (HDTV). См. также vf_cuda_grid.c rgb_to_yuv709. + * + * Лицензия: LGPL-2.1+ (соответствует FFmpeg) + */ + +#include "cuda/vector_helpers.cuh" + +extern "C" { + +/* Solid colour α-blend на Y plane. + * dst: pointer на Y plane base + * dst_pitch: bytes per row + * rx, ry, rw, rh: region rect (pixels, must be in-bounds) + * fill: 0..255 (Y component to blend in) + * alpha: 0..255 (255 = fully opaque) + */ +__global__ void Alpha_Fill_Y(unsigned char *dst, int dst_pitch, + int rx, int ry, int rw, int rh, + int fill, int alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= rw || y >= rh) return; + unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x); + int cur = *p; + *p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255); +} + +/* Solid colour α-blend на UV plane (NV12 interleaved). + * rx, ry: in chroma plane coords (= full-res Y x/2, y/2) + * rw, rh: also в chroma coords + */ +__global__ void Alpha_Fill_UV(unsigned char *dst, int dst_pitch, + int rx, int ry, int rw, int rh, + int fill_u, int fill_v, int alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= rw || y >= rh) return; + unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2; + int cu = p[0], cv = p[1]; + p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255); + p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255); +} + +/* Blit RGBA atlas → Y plane с α-blending. + * dst, dst_pitch: Y plane + * dx, dy: destination pixel offset (full-res coords) + * atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch в bytes + * w, h: atlas dimensions (in pixels) + * extra_alpha: дополнительный множитель (0..255) — overlay-level opacity + */ +__global__ void Alpha_Blit_RGBA_Y(unsigned char *dst, int dst_pitch, + int dx, int dy, + const unsigned char *atlas, int atlas_pitch, + int w, int h, int extra_alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= w || y >= h) return; + const unsigned char *sp = atlas + y * atlas_pitch + x * 4; + int r = sp[0], g = sp[1], b = sp[2], a = sp[3]; + a = a * extra_alpha / 255; + if (a == 0) return; + int Y = (int)(0.183f * r + 0.614f * g + 0.062f * b) + 16; + Y = Y < 0 ? 0 : (Y > 255 ? 255 : Y); + unsigned char *p = dst + (dy + y) * dst_pitch + (dx + x); + int cur = *p; + *p = (unsigned char)((Y * a + cur * (255 - a)) / 255); +} + +/* Blit RGBA atlas → UV plane с 4:2:0 chroma subsampling. + * dst, dst_pitch: UV plane + * dx, dy: destination pixel offset (full-res coords; обе должны быть кратны 2) + * atlas, atlas_pitch: RGBA source + * w, h: atlas dimensions (full-res; UV operates на w/2 × h/2) + * extra_alpha: 0..255 + */ +__global__ void Alpha_Blit_RGBA_UV(unsigned char *dst, int dst_pitch, + int dx, int dy, + const unsigned char *atlas, int atlas_pitch, + int w, int h, int extra_alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int hw = w / 2, hh = h / 2; + if (x >= hw || y >= hh) return; + + int sx = x * 2, sy = y * 2; + const unsigned char *row0 = atlas + sy * atlas_pitch; + const unsigned char *row1 = atlas + (sy + 1) * atlas_pitch; + + int r = (row0[sx*4+0] + row0[(sx+1)*4+0] + row1[sx*4+0] + row1[(sx+1)*4+0]) >> 2; + int g = (row0[sx*4+1] + row0[(sx+1)*4+1] + row1[sx*4+1] + row1[(sx+1)*4+1]) >> 2; + int b = (row0[sx*4+2] + row0[(sx+1)*4+2] + row1[sx*4+2] + row1[(sx+1)*4+2]) >> 2; + int a = (row0[sx*4+3] + row0[(sx+1)*4+3] + row1[sx*4+3] + row1[(sx+1)*4+3]) >> 2; + a = a * extra_alpha / 255; + if (a == 0) return; + + int U = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128; + int V = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128; + U = U < 0 ? 0 : (U > 255 ? 255 : U); + V = V < 0 ? 0 : (V > 255 ? 255 : V); + + int du = dx / 2 + x; + int dv = dy / 2 + y; + unsigned char *p = dst + dv * dst_pitch + du * 2; + int cu = p[0], cv = p[1]; + p[0] = (unsigned char)((U * a + cu * (255 - a)) / 255); + p[1] = (unsigned char)((V * a + cv * (255 - a)) / 255); +} + +} /* extern "C" */ -- 2.52.0 From 1e54f04e247143d8bf5312dc17292e6f591909e7 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 22:26:12 +0100 Subject: [PATCH 09/14] =?UTF-8?q?configure:=20add=20cuda=5Fgrid=5Ffilter?= =?UTF-8?q?=5Fdeps=5Fany=20=D0=B4=D0=BB=D1=8F=20cuda=5Fnvcc/cuda=5Fllvm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Без этого filter compileд но .ptx step failед: configure не knows что filter требует nvcc/clang для .cu compile. Same pattern как scale_cuda. Co-Authored-By: Claude Opus 4.7 --- configure | 1 + 1 file changed, 1 insertion(+) diff --git a/configure b/configure index 9c60cb7..b94029e 100755 --- a/configure +++ b/configure @@ -3318,6 +3318,7 @@ transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" cuda_grid_filter_deps="ffnvcodec" +cuda_grid_filter_deps_any="cuda_nvcc cuda_llvm" sharpen_npp_filter_deps="ffnvcodec libnpp" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" -- 2.52.0 From 40104613009c0482fa9d9bfd4910245053a6ec68 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 22:30:36 +0100 Subject: [PATCH 10/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-3=20?= =?UTF-8?q?=E2=80=94=20text=20overlay=20(freetype=20+=20RGBA=20atlas)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU rasterization через freetype (DejaVu/Liberation auto-detect или font_file= option), upload pitched RGBA buffer to GPU, blit через Alpha_Blit_RGBA_Y/UV kernels (Phase 4b-2 уже had). Cache: per-overlay-id atlas с keys (text, font_size, r/g/b) — re-rasterize только при change. Cleanup при remove_overlay/clear_overlays/uninit. Options: font_file= — TTF path (default: search DejaVu/Liberation) font_size= — default size if overlay не указал свой Wire format: add_overlay text x=.. y=.. text=hello font_size=24 r=255 g=255 b=255 opacity=200 Conditional на CONFIG_LIBFREETYPE — без него text overlays no-op (остальные типы работают как обычно). Co-Authored-By: Claude Opus 4.7 --- libavfilter/vf_cuda_grid.c | 417 ++++++++++++++++++++++++++++++++++++- 1 file changed, 413 insertions(+), 4 deletions(-) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 675fb15..11735e1 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -31,6 +31,8 @@ #include "config_components.h" +#include + #include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/cuda_check.h" @@ -42,6 +44,12 @@ #include "libavutil/pixdesc.h" #include "libavutil/thread.h" +#include "config.h" +#if CONFIG_LIBFREETYPE +#include +#include FT_FREETYPE_H +#endif + #include "avfilter.h" #include "cuda/load_helper.h" #include "filters.h" @@ -183,6 +191,17 @@ static const LayoutTemplate *find_layout(const char *name) /* ─── Filter state ─────────────────────────────────────────────────────── */ +/* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */ +typedef struct TextAtlas { + char id[OVERLAY_ID_MAX]; + char cached_text[OVERLAY_TEXT_MAX]; + int cached_font_size; + uint8_t cached_r, cached_g, cached_b; + CUdeviceptr device_ptr; + size_t device_pitch; + int w, h; +} TextAtlas; + typedef struct CudaGridContext { const AVClass *class; @@ -190,6 +209,8 @@ typedef struct CudaGridContext { char *layout_name; int out_width; int out_height; + char *font_file; + int default_font_size; /* Resolved layout (после init) */ const LayoutTemplate *layout; @@ -218,6 +239,16 @@ typedef struct CudaGridContext { CUfunction cu_func_alpha_fill_uv; CUfunction cu_func_alpha_blit_rgba_y; CUfunction cu_func_alpha_blit_rgba_uv; + + /* Text rendering (Phase 4b-3, libfreetype optional) */ +#if CONFIG_LIBFREETYPE + FT_Library ft_lib; + FT_Face ft_face; + int ft_ready; /* 1 once library + face loaded */ + int ft_init_attempted; /* set on first attempt, success or fail */ +#endif + TextAtlas atlases[MAX_OVERLAYS]; + int nb_atlases; } CudaGridContext; /* ─── Composition: copy одного input plane в target region output ──────── */ @@ -456,6 +487,344 @@ static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out, return 0; } +/* ─── Text rendering (Phase 4b-3, freetype) ────────────────────────────── */ + +#if CONFIG_LIBFREETYPE + +static const char *const default_fonts[] = { + "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", + "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", + "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf", + "/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf", + NULL, +}; + +static int ensure_ft_loaded(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + const char *path = s->font_file; + FT_Error err; + int i; + + if (s->ft_ready) return 0; + if (s->ft_init_attempted) return AVERROR(ENOSYS); + s->ft_init_attempted = 1; + + err = FT_Init_FreeType(&s->ft_lib); + if (err) { + av_log(ctx, AV_LOG_WARNING, "FT_Init_FreeType failed (err=%d), text overlays disabled\n", err); + return AVERROR(ENOSYS); + } + + if (!path) { + for (i = 0; default_fonts[i]; i++) { + if (access(default_fonts[i], R_OK) == 0) { + path = default_fonts[i]; + break; + } + } + } + if (!path) { + av_log(ctx, AV_LOG_WARNING, "no font found (set font_file= option), text disabled\n"); + FT_Done_FreeType(s->ft_lib); + s->ft_lib = NULL; + return AVERROR(ENOENT); + } + + err = FT_New_Face(s->ft_lib, path, 0, &s->ft_face); + if (err) { + av_log(ctx, AV_LOG_WARNING, "FT_New_Face(%s) failed: %d, text disabled\n", path, err); + FT_Done_FreeType(s->ft_lib); + s->ft_lib = NULL; + return AVERROR(EIO); + } + + av_log(ctx, AV_LOG_INFO, "freetype: loaded %s\n", path); + s->ft_ready = 1; + return 0; +} + +/* Two-pass rasterize: 1) measure total_w / ascent / descent, 2) blit to RGBA buffer. + * Returns malloc'd RGBA buffer (caller frees) + dims via out params, or NULL. */ +static uint8_t *rasterize_text_rgba(AVFilterContext *ctx, const char *text, int font_size, + uint8_t r, uint8_t g, uint8_t b, + int *out_w, int *out_h) +{ + CudaGridContext *s = ctx->priv; + FT_Error err; + const unsigned char *p; + int total_w = 0, ascent_px = 0, descent_px = 0; + int atlas_w, atlas_h, pen_x, baseline_y; + uint8_t *atlas; + + if (!s->ft_ready) return NULL; + err = FT_Set_Pixel_Sizes(s->ft_face, 0, font_size); + if (err) { + av_log(ctx, AV_LOG_WARNING, "FT_Set_Pixel_Sizes failed: %d\n", err); + return NULL; + } + + /* Pass 1: measure */ + for (p = (const unsigned char *)text; *p; p++) { + FT_UInt gi; + FT_GlyphSlot g_slot; + gi = FT_Get_Char_Index(s->ft_face, *p); + if (!gi) continue; + if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_DEFAULT)) continue; + g_slot = s->ft_face->glyph; + total_w += (g_slot->advance.x >> 6); + ascent_px = FFMAX(ascent_px, g_slot->bitmap_top); + descent_px = FFMAX(descent_px, (int)g_slot->bitmap.rows - g_slot->bitmap_top); + } + if (total_w <= 0) return NULL; + + atlas_w = total_w + 4; /* padding */ + atlas_h = ascent_px + descent_px + 4; + if (atlas_h <= 0) atlas_h = font_size; + + atlas = av_mallocz((size_t)atlas_w * atlas_h * 4); + if (!atlas) return NULL; + + /* Pass 2: blit */ + pen_x = 2; + baseline_y = ascent_px + 2; + for (p = (const unsigned char *)text; *p; p++) { + FT_UInt gi; + FT_GlyphSlot g_slot; + int gx, gy, bw, bh; + int yy, xx; + + gi = FT_Get_Char_Index(s->ft_face, *p); + if (!gi) continue; + if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_RENDER)) continue; + g_slot = s->ft_face->glyph; + if (g_slot->bitmap.pixel_mode != FT_PIXEL_MODE_GRAY) { + pen_x += g_slot->advance.x >> 6; + continue; + } + bw = g_slot->bitmap.width; + bh = g_slot->bitmap.rows; + gx = pen_x + g_slot->bitmap_left; + gy = baseline_y - g_slot->bitmap_top; + + for (yy = 0; yy < bh; yy++) { + int dy = gy + yy; + if (dy < 0 || dy >= atlas_h) continue; + for (xx = 0; xx < bw; xx++) { + int dx = gx + xx; + uint8_t gray; + uint8_t *dst; + if (dx < 0 || dx >= atlas_w) continue; + gray = g_slot->bitmap.buffer[yy * g_slot->bitmap.pitch + xx]; + if (gray == 0) continue; + dst = atlas + ((size_t)dy * atlas_w + dx) * 4; + /* Premultiplied: max() — text "fills" rather than blending overlaps */ + if (gray > dst[3]) { + dst[0] = r; + dst[1] = g; + dst[2] = b; + dst[3] = gray; + } + } + } + pen_x += g_slot->advance.x >> 6; + } + + *out_w = atlas_w; + *out_h = atlas_h; + return atlas; +} + +#endif /* CONFIG_LIBFREETYPE */ + +/* Cache lookup — locked by caller. */ +static TextAtlas *atlas_find_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_atlases; i++) + if (!strcmp(s->atlases[i].id, id)) + return &s->atlases[i]; + return NULL; +} + +static void atlas_free_gpu_locked(CudaGridContext *s, TextAtlas *a) +{ + if (a->device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(a->device_ptr); + a->device_ptr = 0; + } +} + +static void atlas_remove_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_atlases; i++) { + if (!strcmp(s->atlases[i].id, id)) { + atlas_free_gpu_locked(s, &s->atlases[i]); + memmove(&s->atlases[i], &s->atlases[i + 1], + (s->nb_atlases - i - 1) * sizeof(TextAtlas)); + s->nb_atlases--; + return; + } + } +} + +#if CONFIG_LIBFREETYPE +/* Ensure GPU atlas for given overlay matches its text/font_size/color. cu ctx pushed by caller. */ +static int ensure_text_atlas(AVFilterContext *ctx, const GridOverlay *ov, TextAtlas **out_atlas) +{ + CudaGridContext *s = ctx->priv; + TextAtlas *a; + uint8_t *cpu_atlas; + int aw, ah; + int font_size; + CUdeviceptr dev_ptr = 0; + size_t dev_pitch = 0; + CUDA_MEMCPY2D cpy = { 0 }; + int ret; + + font_size = ov->u.text.font_size > 0 ? ov->u.text.font_size : s->default_font_size; + + a = atlas_find_locked(s, ov->id); + if (a && + !strcmp(a->cached_text, ov->u.text.text) && + a->cached_font_size == font_size && + a->cached_r == ov->u.text.r && + a->cached_g == ov->u.text.g && + a->cached_b == ov->u.text.b) { + *out_atlas = a; + return 0; + } + + if (s->nb_atlases >= MAX_OVERLAYS && !a) + return AVERROR(ENOSPC); + + ret = ensure_ft_loaded(ctx); + if (ret < 0) return ret; + + cpu_atlas = rasterize_text_rgba(ctx, ov->u.text.text, font_size, + ov->u.text.r, ov->u.text.g, ov->u.text.b, + &aw, &ah); + if (!cpu_atlas) return AVERROR(EINVAL); + + /* Allocate pitched device buffer + upload */ + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch( + &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4)); + if (ret < 0) { av_free(cpu_atlas); return ret; } + + cpy.srcMemoryType = CU_MEMORYTYPE_HOST; + cpy.srcHost = cpu_atlas; + cpy.srcPitch = (size_t)aw * 4; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = dev_ptr; + cpy.dstPitch = dev_pitch; + cpy.WidthInBytes = (size_t)aw * 4; + cpy.Height = (size_t)ah; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy)); + av_free(cpu_atlas); + if (ret < 0) { + s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr); + return ret; + } + + if (a) { + atlas_free_gpu_locked(s, a); + } else { + a = &s->atlases[s->nb_atlases++]; + memset(a, 0, sizeof(*a)); + av_strlcpy(a->id, ov->id, sizeof(a->id)); + } + av_strlcpy(a->cached_text, ov->u.text.text, sizeof(a->cached_text)); + a->cached_font_size = font_size; + a->cached_r = ov->u.text.r; + a->cached_g = ov->u.text.g; + a->cached_b = ov->u.text.b; + a->device_ptr = dev_ptr; + a->device_pitch = dev_pitch; + a->w = aw; + a->h = ah; + *out_atlas = a; + return 0; +} + +static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + TextAtlas *a = NULL; + int px, py; + int base_x, base_y, base_w, base_h; + CUdeviceptr dst_y, dst_uv; + int ret; + int aw, ah, ap, ealpha; + + if (!ov->u.text.text[0]) return 0; + ret = ensure_text_atlas(ctx, ov, &a); + if (ret < 0) { + av_log(ctx, AV_LOG_WARNING, "text overlay %s: atlas failed (ret=%d)\n", ov->id, ret); + return 0; /* non-fatal — skip */ + } + if (!a) return 0; + + /* Position: x,y normalized within cell or output (top-left corner of atlas). */ + if (ov->cell < 0) { + base_x = 0; base_y = 0; + base_w = s->out_width; base_h = s->out_height; + } else if (ov->cell < s->layout->nb_cells) { + base_x = s->cell_px[ov->cell].x; + base_y = s->cell_px[ov->cell].y; + base_w = s->cell_px[ov->cell].w; + base_h = s->cell_px[ov->cell].h; + } else { + return 0; + } + px = base_x + (int)(ov->x * base_w); + py = base_y + (int)(ov->y * base_h); + px &= ~1; py &= ~1; + /* Clip — if atlas runs past output bounds, kernels still safe (they bounds-check + * against atlas w/h, not dst). But avoid going off output: simple skip when fully outside. */ + if (px >= s->out_width || py >= s->out_height) return 0; + + aw = a->w; ah = a->h; + /* Reduce ah if it overflows output (avoid OOB on Y plane); UV plane uses h/2 so same scale. */ + if (px + aw > s->out_width) aw = s->out_width - px; + if (py + ah > s->out_height) ah = s->out_height - py; + aw &= ~1; ah &= ~1; + if (aw <= 0 || ah <= 0) return 0; + + ap = (int)a->device_pitch; + ealpha = ov->opacity; + + /* Y plane */ + dst_y = (CUdeviceptr)out->data[0]; + { + int dst_pitch_y = out->linesize[0]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_y, + DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + /* UV plane (kernel does internal half-res subsampling) */ + dst_uv = (CUdeviceptr)out->data[1]; + { + int dst_pitch_uv = out->linesize[1]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_uv, + DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + return 0; +} + +#endif /* CONFIG_LIBFREETYPE */ + static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) { CudaGridContext *s = ctx->priv; @@ -538,7 +907,12 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out) if (ret < 0) return ret; break; case OV_TYPE_TEXT: - av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id); +#if CONFIG_LIBFREETYPE + ret = render_overlay_text(ctx, out, ov); + if (ret < 0) return ret; +#else + av_log(ctx, AV_LOG_TRACE, "overlay %s: text — libfreetype disabled at build\n", ov->id); +#endif break; case OV_TYPE_ICON: av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id); @@ -688,14 +1062,29 @@ static av_cold int cuda_grid_init(AVFilterContext *ctx) static av_cold void cuda_grid_uninit(AVFilterContext *ctx) { CudaGridContext *s = ctx->priv; + int i; ff_framesync_uninit(&s->fs); - if (s->cu_module && s->hwctx) { + + if (s->hwctx) { CUcontext dummy; CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); - CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); + for (i = 0; i < s->nb_atlases; i++) { + if (s->atlases[i].device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(s->atlases[i].device_ptr); + s->atlases[i].device_ptr = 0; + } + } + s->nb_atlases = 0; + if (s->cu_module) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + } CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - s->cu_module = NULL; } +#if CONFIG_LIBFREETYPE + if (s->ft_face) { FT_Done_Face(s->ft_face); s->ft_face = NULL; } + if (s->ft_lib) { FT_Done_FreeType(s->ft_lib); s->ft_lib = NULL; } +#endif if (s->overlay_lock_inited) { pthread_mutex_destroy(&s->overlay_lock); s->overlay_lock_inited = 0; @@ -731,14 +1120,30 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd, } pthread_mutex_lock(&s->overlay_lock); ret = overlay_remove_locked(s, id); + /* Best-effort atlas GPU cleanup: cu ctx may not be pushed here, push it. */ + if (ret == 0 && s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + atlas_remove_locked(s, id); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } pthread_mutex_unlock(&s->overlay_lock); if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id); return ret; } if (!strcmp(cmd, "clear_overlays")) { + int i; pthread_mutex_lock(&s->overlay_lock); s->nb_overlays = 0; + if (s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + for (i = 0; i < s->nb_atlases; i++) + atlas_free_gpu_locked(s, &s->atlases[i]); + s->nb_atlases = 0; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } pthread_mutex_unlock(&s->overlay_lock); if (res) av_strlcpy(res, "ok", res_len); return 0; @@ -899,6 +1304,10 @@ static const AVOption cuda_grid_options[] = { OFFSET(out_width), AV_OPT_TYPE_INT, { .i64 = 1920 }, 16, 16384, FLAGS }, { "out_h", "высота output frame в пикселях", OFFSET(out_height), AV_OPT_TYPE_INT, { .i64 = 1080 }, 16, 16384, FLAGS }, + { "font_file", "TTF/OTF font path (default: search DejaVu/Liberation)", + OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { "font_size", "default text overlay size (px), используется если overlay не указал свой", + OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS }, { NULL } }; -- 2.52.0 From b88f966f835803d79fee14c783ce7d7024fb600a Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 22:32:38 +0100 Subject: [PATCH 11/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20URL-decode=20=D0=B4?= =?UTF-8?q?=D0=BB=D1=8F=20string=20overlay=20fields=20(text,=20icon=5Fname?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Controller URL-encode'ит string values (text="hello world" → text=hello%20world) чтобы пройти sscanf("%s") который stops on whitespace. Filter inline decode'ит '%xx' obratno в bytes. Только для text + icon_name. Co-Authored-By: Claude Opus 4.7 --- libavfilter/vf_cuda_grid.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 11735e1..1a45eb1 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -294,7 +294,33 @@ static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b, *V = av_clip_uint8(v); } +/* In-place URL-decode (%20 etc.) — for string values приходящих от controller. */ +static void url_decode_inplace(char *s) +{ + char *r = s, *w = s; + while (*r) { + if (r[0] == '%' && r[1] && r[2]) { + int hi = -1, lo = -1; + char c1 = r[1], c2 = r[2]; + if (c1 >= '0' && c1 <= '9') hi = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') hi = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') hi = c1 - 'a' + 10; + if (c2 >= '0' && c2 <= '9') lo = c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') lo = c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') lo = c2 - 'a' + 10; + if (hi >= 0 && lo >= 0) { + *w++ = (char)((hi << 4) | lo); + r += 3; + continue; + } + } + *w++ = *r++; + } + *w = '\0'; +} + /* Parse args в формате: " = = ..." + * String values приходят URL-encoded (%20 = space) от controller. * Out *ov заполняется defaults + parsed values. Returns 0 / AVERROR. */ static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov) { @@ -361,13 +387,19 @@ static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverla else if (!strcmp(key, "b")) ov->u.rect.b = av_clip_uint8(atoi(val)); else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val); } else if (ov->type == OV_TYPE_TEXT) { - if (!strcmp(key, "text")) av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text)); + if (!strcmp(key, "text")) { + av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text)); + url_decode_inplace(ov->u.text.text); + } else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val); else if (!strcmp(key, "r")) ov->u.text.r = av_clip_uint8(atoi(val)); else if (!strcmp(key, "g")) ov->u.text.g = av_clip_uint8(atoi(val)); else if (!strcmp(key, "b")) ov->u.text.b = av_clip_uint8(atoi(val)); } else if (ov->type == OV_TYPE_ICON) { - if (!strcmp(key, "icon_name")) av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name)); + if (!strcmp(key, "icon_name")) { + av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name)); + url_decode_inplace(ov->u.icon.icon_name); + } } else if (ov->type == OV_TYPE_DIM) { if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val)); } -- 2.52.0 From c5130cb15ceb7e82febd0b549af0006264733182 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 22:41:11 +0100 Subject: [PATCH 12/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-4=20?= =?UTF-8?q?=E2=80=94=20icon=20overlay=20(PNG/JPG=20decode=20+=20sprite=20b?= =?UTF-8?q?lit)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PNG/JPG decode через libavcodec (системный decoder), conversion в RGBA8 через swscale, upload в pitched CUDA buffer, blit через Alpha_Blit_RGBA_Y/UV (те же kernels что text). Icon cache — shared by name (одна и та же иконка для нескольких overlays = один GPU atlas). Lifecycle: load on first use, free в uninit. Path resolution: / + try .png/.jpg extensions: /var/lib/cuda-grid/icons (default) /opt/cuda-grid/icons (fallback) + abs path supported Options: icon_dir= — overrides default search paths Wire format: add_overlay icon x=.. y=.. icon_name=domofon opacity=200 Co-Authored-By: Claude Opus 4.7 --- libavfilter/vf_cuda_grid.c | 273 ++++++++++++++++++++++++++++++++++++- 1 file changed, 272 insertions(+), 1 deletion(-) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 1a45eb1..32c4922 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -50,6 +50,10 @@ #include FT_FREETYPE_H #endif +#include "libavcodec/avcodec.h" +#include "libavformat/avformat.h" +#include "libswscale/swscale.h" + #include "avfilter.h" #include "cuda/load_helper.h" #include "filters.h" @@ -191,6 +195,17 @@ static const LayoutTemplate *find_layout(const char *name) /* ─── Filter state ─────────────────────────────────────────────────────── */ +/* Icon atlas — RGBA buffer на GPU, keyed by icon_name (Phase 4b-4, shared across overlays). */ +#define OVERLAY_ICON_NAME_MAX 32 +#define MAX_ICON_ATLASES 32 + +typedef struct IconAtlas { + char icon_name[OVERLAY_ICON_NAME_MAX]; + CUdeviceptr device_ptr; + size_t device_pitch; + int w, h; +} IconAtlas; + /* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */ typedef struct TextAtlas { char id[OVERLAY_ID_MAX]; @@ -211,6 +226,7 @@ typedef struct CudaGridContext { int out_height; char *font_file; int default_font_size; + char *icon_dir; /* Resolved layout (после init) */ const LayoutTemplate *layout; @@ -249,6 +265,10 @@ typedef struct CudaGridContext { #endif TextAtlas atlases[MAX_OVERLAYS]; int nb_atlases; + + /* Icon atlas cache (Phase 4b-4) */ + IconAtlas icon_atlases[MAX_ICON_ATLASES]; + int nb_icon_atlases; } CudaGridContext; /* ─── Composition: copy одного input plane в target region output ──────── */ @@ -857,6 +877,247 @@ static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOve #endif /* CONFIG_LIBFREETYPE */ +/* ─── Icon rendering (Phase 4b-4, PNG via libavcodec + swscale) ────────── */ + +static const char *const default_icon_dirs[] = { + "/var/lib/cuda-grid/icons", + "/opt/cuda-grid/icons", + NULL, +}; + +/* Read image file, decode (PNG/JPEG/etc.), convert to packed RGBA8. + * Returns malloc'd buffer (caller frees) + dims, or NULL on error. */ +static uint8_t *load_image_to_rgba_cpu(AVFilterContext *ctx, const char *path, + int *out_w, int *out_h) +{ + AVFormatContext *fmt_ctx = NULL; + AVCodecContext *cctx = NULL; + AVFrame *frame = NULL; + AVPacket *pkt = NULL; + struct SwsContext *sws = NULL; + const AVCodec *codec; + int stream_idx; + uint8_t *rgba = NULL; + int rgba_pitch; + int ret; + + if (avformat_open_input(&fmt_ctx, path, NULL, NULL) < 0) { + av_log(ctx, AV_LOG_WARNING, "icon: open %s failed\n", path); + return NULL; + } + if (avformat_find_stream_info(fmt_ctx, NULL) < 0) goto fail; + + stream_idx = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0); + if (stream_idx < 0) goto fail; + + codec = avcodec_find_decoder(fmt_ctx->streams[stream_idx]->codecpar->codec_id); + if (!codec) goto fail; + + cctx = avcodec_alloc_context3(codec); + if (!cctx) goto fail; + if (avcodec_parameters_to_context(cctx, fmt_ctx->streams[stream_idx]->codecpar) < 0) goto fail; + if (avcodec_open2(cctx, codec, NULL) < 0) goto fail; + + pkt = av_packet_alloc(); + frame = av_frame_alloc(); + if (!pkt || !frame) goto fail; + + if (av_read_frame(fmt_ctx, pkt) < 0) goto fail; + if (avcodec_send_packet(cctx, pkt) < 0) goto fail; + ret = avcodec_receive_frame(cctx, frame); + if (ret < 0) goto fail; + + sws = sws_getContext(frame->width, frame->height, frame->format, + frame->width, frame->height, AV_PIX_FMT_RGBA, + SWS_BILINEAR, NULL, NULL, NULL); + if (!sws) goto fail; + + rgba_pitch = frame->width * 4; + rgba = av_malloc((size_t)rgba_pitch * frame->height); + if (!rgba) goto fail; + + { + uint8_t *out_data[4] = { rgba, NULL, NULL, NULL }; + int out_pitch[4] = { rgba_pitch, 0, 0, 0 }; + sws_scale(sws, (const uint8_t * const *)frame->data, frame->linesize, + 0, frame->height, out_data, out_pitch); + } + + *out_w = frame->width; + *out_h = frame->height; + +fail: + if (sws) sws_freeContext(sws); + if (frame) av_frame_free(&frame); + if (pkt) av_packet_free(&pkt); + if (cctx) avcodec_free_context(&cctx); + if (fmt_ctx) avformat_close_input(&fmt_ctx); + return rgba; +} + +/* Resolve icon name к full path (search dirs + extensions). */ +static int resolve_icon_path(AVFilterContext *ctx, const char *icon_name, + char *out_path, size_t out_path_sz) +{ + CudaGridContext *s = ctx->priv; + static const char *const exts[] = { "", ".png", ".jpg", NULL }; + const char *dirs[8] = { 0 }; + int i, j, ndir = 0; + + /* Если absolute path — use as-is */ + if (icon_name[0] == '/') { + if (access(icon_name, R_OK) == 0) { + av_strlcpy(out_path, icon_name, out_path_sz); + return 0; + } + return AVERROR(ENOENT); + } + + if (s->icon_dir) dirs[ndir++] = s->icon_dir; + for (i = 0; default_icon_dirs[i] && ndir < 7; i++) dirs[ndir++] = default_icon_dirs[i]; + + for (i = 0; i < ndir; i++) { + for (j = 0; exts[j]; j++) { + snprintf(out_path, out_path_sz, "%s/%s%s", dirs[i], icon_name, exts[j]); + if (access(out_path, R_OK) == 0) return 0; + } + } + return AVERROR(ENOENT); +} + +static IconAtlas *icon_atlas_find_locked(CudaGridContext *s, const char *name) +{ + int i; + for (i = 0; i < s->nb_icon_atlases; i++) + if (!strcmp(s->icon_atlases[i].icon_name, name)) + return &s->icon_atlases[i]; + return NULL; +} + +/* Ensure GPU atlas for icon_name loaded. cu ctx pushed by caller. */ +static int ensure_icon_atlas(AVFilterContext *ctx, const char *icon_name, IconAtlas **out) +{ + CudaGridContext *s = ctx->priv; + IconAtlas *a; + uint8_t *cpu_rgba; + char path[512]; + int aw, ah; + CUdeviceptr dev_ptr = 0; + size_t dev_pitch = 0; + CUDA_MEMCPY2D cpy = { 0 }; + int ret; + + a = icon_atlas_find_locked(s, icon_name); + if (a && a->device_ptr) { *out = a; return 0; } + if (s->nb_icon_atlases >= MAX_ICON_ATLASES && !a) return AVERROR(ENOSPC); + + ret = resolve_icon_path(ctx, icon_name, path, sizeof(path)); + if (ret < 0) { + av_log(ctx, AV_LOG_WARNING, "icon '%s' not found in search paths\n", icon_name); + return ret; + } + + cpu_rgba = load_image_to_rgba_cpu(ctx, path, &aw, &ah); + if (!cpu_rgba) return AVERROR(EINVAL); + + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch( + &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4)); + if (ret < 0) { av_free(cpu_rgba); return ret; } + + cpy.srcMemoryType = CU_MEMORYTYPE_HOST; + cpy.srcHost = cpu_rgba; + cpy.srcPitch = (size_t)aw * 4; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = dev_ptr; + cpy.dstPitch = dev_pitch; + cpy.WidthInBytes = (size_t)aw * 4; + cpy.Height = (size_t)ah; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy)); + av_free(cpu_rgba); + if (ret < 0) { + s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr); + return ret; + } + + if (!a) { + a = &s->icon_atlases[s->nb_icon_atlases++]; + memset(a, 0, sizeof(*a)); + av_strlcpy(a->icon_name, icon_name, sizeof(a->icon_name)); + } + a->device_ptr = dev_ptr; + a->device_pitch = dev_pitch; + a->w = aw; + a->h = ah; + *out = a; + av_log(ctx, AV_LOG_INFO, "icon '%s' loaded %s (%dx%d)\n", icon_name, path, aw, ah); + return 0; +} + +static int render_overlay_icon(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + IconAtlas *a = NULL; + int px, py; + int base_x, base_y, base_w, base_h; + int aw, ah, ap, ealpha; + CUdeviceptr dst_y, dst_uv; + int ret; + + if (!ov->u.icon.icon_name[0]) return 0; + ret = ensure_icon_atlas(ctx, ov->u.icon.icon_name, &a); + if (ret < 0 || !a) return 0; /* non-fatal — skip */ + + if (ov->cell < 0) { + base_x = 0; base_y = 0; + base_w = s->out_width; base_h = s->out_height; + } else if (ov->cell < s->layout->nb_cells) { + base_x = s->cell_px[ov->cell].x; + base_y = s->cell_px[ov->cell].y; + base_w = s->cell_px[ov->cell].w; + base_h = s->cell_px[ov->cell].h; + } else return 0; + + px = base_x + (int)(ov->x * base_w); + py = base_y + (int)(ov->y * base_h); + px &= ~1; py &= ~1; + if (px >= s->out_width || py >= s->out_height) return 0; + + aw = a->w; ah = a->h; + if (px + aw > s->out_width) aw = s->out_width - px; + if (py + ah > s->out_height) ah = s->out_height - py; + aw &= ~1; ah &= ~1; + if (aw <= 0 || ah <= 0) return 0; + + ap = (int)a->device_pitch; + ealpha = ov->opacity; + + dst_y = (CUdeviceptr)out->data[0]; + { + int dst_pitch_y = out->linesize[0]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_y, + DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + dst_uv = (CUdeviceptr)out->data[1]; + { + int dst_pitch_uv = out->linesize[1]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_uv, + DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + return 0; +} + static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) { CudaGridContext *s = ctx->priv; @@ -947,7 +1208,8 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out) #endif break; case OV_TYPE_ICON: - av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id); + ret = render_overlay_icon(ctx, out, ov); + if (ret < 0) return ret; break; } } @@ -1107,6 +1369,13 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx) } } s->nb_atlases = 0; + for (i = 0; i < s->nb_icon_atlases; i++) { + if (s->icon_atlases[i].device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr); + s->icon_atlases[i].device_ptr = 0; + } + } + s->nb_icon_atlases = 0; if (s->cu_module) { CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); s->cu_module = NULL; @@ -1340,6 +1609,8 @@ static const AVOption cuda_grid_options[] = { OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, { "font_size", "default text overlay size (px), используется если overlay не указал свой", OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS }, + { "icon_dir", "directory для resolve icon_name (по умолчанию /var/lib/cuda-grid/icons)", + OFFSET(icon_dir), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, { NULL } }; -- 2.52.0 From a326ef146ca510eb0a5d615e5cc33f731b5d79d3 Mon Sep 17 00:00:00 2001 From: gx Date: Wed, 20 May 2026 21:48:09 +0100 Subject: [PATCH 13/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=206=20=E2=80=94?= =?UTF-8?q?=20process=5Fcommand=20'reload=5Ficon=20'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Invalidate cached icon atlas by name. Next render автоматически re-reads PNG file с disk. Used controller'ом для periodic re-render dynamic overlays (charts/chats) — write new PNG → send reload_icon → filter подхватывает. Без этого pattern dynamic overlay невозможен — icon overlay cached forever after first load by icon_name. Co-Authored-By: Claude Opus 4.7 --- libavfilter/vf_cuda_grid.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 32c4922..3bb2852 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -1450,6 +1450,39 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd, return 0; } + if (!strcmp(cmd, "reload_icon")) { + /* Invalidate cached icon atlas by name — next render re-reads file from disk. + * Used Phase 6 by controller для periodic chart/chat re-rendering. */ + char name[OVERLAY_ICON_NAME_MAX]; + int i, found = -1; + if (!arg || sscanf(arg, "%31s", name) != 1) { + if (res) av_strlcpy(res, "err parse", res_len); + return AVERROR(EINVAL); + } + pthread_mutex_lock(&s->overlay_lock); + if (s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + for (i = 0; i < s->nb_icon_atlases; i++) { + if (!strcmp(s->icon_atlases[i].icon_name, name)) { + if (s->icon_atlases[i].device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr); + s->icon_atlases[i].device_ptr = 0; + } + memmove(&s->icon_atlases[i], &s->icon_atlases[i + 1], + (s->nb_icon_atlases - i - 1) * sizeof(IconAtlas)); + s->nb_icon_atlases--; + found = i; + break; + } + } + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } + pthread_mutex_unlock(&s->overlay_lock); + if (res) snprintf(res, res_len, found >= 0 ? "ok %s reloaded" : "not_cached %s", name); + return 0; + } + /* Future: set_layout (Phase 3 на filter side — пока nb_inputs хардкодится в init) */ /* Fall back to standard option/command handling */ -- 2.52.0 From 636bd7885410539595ef45c7e8dc36461ad04686 Mon Sep 17 00:00:00 2001 From: gx Date: Thu, 21 May 2026 07:08:23 +0100 Subject: [PATCH 14/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20overlay=20=D1=81=20in?= =?UTF-8?q?valid=20cell=20=E2=80=94=20silent=20skip=20=D0=B2=D0=BC=D0=B5?= =?UTF-8?q?=D1=81=D1=82=D0=BE=20AVERROR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Когда controller broadcast'ит overlay command ко всем cuda_grid instances (quad/single/mpp), overlay с cell=2 (gate_lpr cell в quad/mpp) попадает в single layout который имеет nb_cells=1 → overlay_pixel_rect возвращал AVERROR(EINVAL) → render_overlays propagated к cuda_grid_compose → "Error while filtering: Invalid argument" каждый frame → pipeline crash loop. Fix: return 1 (positive skip) вместо AVERROR. render_overlay_rect уже имеет `if (ret != 0) return ret < 0 ? ret : 0;` — positive skip правильно обрабатывается без error. Co-Authored-By: Claude Opus 4.7 --- libavfilter/vf_cuda_grid.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 3bb2852..6367239 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -476,7 +476,10 @@ static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov, base_w = s->cell_px[ov->cell].w; base_h = s->cell_px[ov->cell].h; } else { - return AVERROR(EINVAL); + /* Cell index out of range — silently skip (overlay broadcast'нет ко всем + * cuda_grid instances, layouts с меньшим nb_cells). NOT error чтобы + * не валить filter chain. */ + return 1; } rx = base_x + (int)(ov->x * base_w); -- 2.52.0