From 097ca816057223e45dd66b08fdebd01d09b97816 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 20:47:00 +0100
Subject: [PATCH 01/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=201=20MVP=20?=
 =?UTF-8?q?=E2=80=94=20fixed=20quad=20layout,=204=20CUDA=20inputs=20?=
 =?UTF-8?q?=E2=86=92=201=20output?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 deliverable (см. gx/vf-cuda-grid#1):
- libavfilter/vf_cuda_grid.c (~270 LOC): multi-input filter, fixed 2×2 quad
- 4 NV12 CUDA frames same size → 2W × 2H output frame
- Composition: cuMemcpy2DAsync per Y + UV plane на каждый input
- framesync для lock-step pull всех 4 inputs
- Output hw_frames_ctx allocated from input device_ref
- Build wiring: CONFIG_CUDA_GRID_FILTER → libavfilter/{Makefile,allfilters.c}, configure deps на ffnvcodec

Limitations Phase 1:
- All inputs must be same size (no scaling)
- Quad layout hardcoded (no DSL, no runtime switching)
- NV12 only (no RGBA/YUV420P)

Phase 2: dynamic layouts + scaling. Phase 3: runtime control via process_command.
---
 configure                  |   1 +
 libavfilter/Makefile       |   1 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/vf_cuda_grid.c | 359 +++++++++++++++++++++++++++++++++++++
 4 files changed, 362 insertions(+)
 create mode 100644 libavfilter/vf_cuda_grid.c

diff --git a/configure b/configure
index c24aa94..9c60cb7 100755
--- a/configure
+++ b/configure
@@ -3317,6 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+cuda_grid_filter_deps="ffnvcodec"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 91487af..8015a49 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -410,6 +410,7 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
                                                 cuda/load_helper.o
+OBJS-$(CONFIG_CUDA_GRID_FILTER)              += vf_cuda_grid.o framesync.o
 OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
                                                 opencl/overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_QSV_FILTER)            += vf_overlay_qsv.o framesync.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 9819f0f..bc0d00a 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv;
 extern const AVFilter ff_vf_overlay_vaapi;
 extern const AVFilter ff_vf_overlay_vulkan;
 extern const AVFilter ff_vf_overlay_cuda;
+extern const AVFilter ff_vf_cuda_grid;
 extern const AVFilter ff_vf_owdenoise;
 extern const AVFilter ff_vf_pad;
 extern const AVFilter ff_vf_pad_opencl;
diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
new file mode 100644
index 0000000..34003fe
--- /dev/null
+++ b/libavfilter/vf_cuda_grid.c
@@ -0,0 +1,359 @@
+/*
+ * cuda_grid — GPU-native video grid composer для FFmpeg 7.x.
+ *
+ * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
+ * в layout. End-to-end CUDA (без CPU round-trip).
+ *
+ * Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера,
+ * output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per
+ * Y/UV plane на каждый input → soответствующую quadrant'у output.
+ *
+ * Future phases (см. gx/vf-cuda-grid#1):
+ *  - Phase 2: dynamic layouts + per-cell scaling
+ *  - Phase 3: runtime layout switching через process_command (ZMQ)
+ *  - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
+ *
+ * Лицензия: LGPL-2.1+ (соответствует FFmpeg)
+ */
+
+#include "config_components.h"
+
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/log.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "formats.h"
+#include "framesync.h"
+#include "video.h"
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+#define CUDA_GRID_INPUTS 4  /* Phase 1: fixed quad */
+
+typedef struct CudaGridContext {
+    const AVClass *class;
+
+    AVBufferRef *hw_device_ctx;
+    AVCUDADeviceContext *hwctx;
+    CUcontext cu_ctx;
+    CUstream  cu_stream;
+
+    FFFrameSync fs;
+
+    /* Output dimensions (computed in config_output) */
+    int out_width;
+    int out_height;
+
+    /* Per-cell target rectangles в output frame.
+     * Phase 1 hardcode: 4 ячейки 2×2 (top-left, top-right, bottom-left, bottom-right). */
+    struct {
+        int x, y, w, h;
+    } cells[CUDA_GRID_INPUTS];
+} CudaGridContext;
+
+/* ─── Composition: copy одного input plane в target region output ──────── */
+
+static int copy_input_plane(AVFilterContext *ctx,
+                            CUdeviceptr     src_data,
+                            int             src_pitch,
+                            int             src_w,
+                            int             src_h,
+                            CUdeviceptr     dst_data,
+                            int             dst_pitch,
+                            int             dst_x,
+                            int             dst_y,
+                            int             bytes_per_pixel)
+{
+    CudaGridContext *s = ctx->priv;
+    CUDA_MEMCPY2D cpy = {
+        .srcMemoryType = CU_MEMORYTYPE_DEVICE,
+        .srcDevice     = src_data,
+        .srcPitch      = src_pitch,
+        .dstMemoryType = CU_MEMORYTYPE_DEVICE,
+        .dstDevice     = dst_data,
+        .dstXInBytes   = (size_t)dst_x * bytes_per_pixel,
+        .dstY          = dst_y,
+        .dstPitch      = dst_pitch,
+        .WidthInBytes  = (size_t)src_w * bytes_per_pixel,
+        .Height        = src_h,
+    };
+    return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
+}
+
+/* ─── Framesync callback — N frames аre ready, compose ────────────────── */
+
+static int cuda_grid_compose(FFFrameSync *fs)
+{
+    AVFilterContext *ctx     = fs->parent;
+    AVFilterLink    *outlink = ctx->outputs[0];
+    CudaGridContext *s       = ctx->priv;
+    AVFrame *out = NULL;
+    AVFrame *in[CUDA_GRID_INPUTS] = {0};
+    CUcontext dummy;
+    int ret;
+
+    /* Сбор всех N input frames из framesync */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        ret = ff_framesync_get_frame(fs, i, &in[i], 0);
+        if (ret < 0)
+            return ret;
+        if (!in[i]) {
+            av_log(ctx, AV_LOG_WARNING, "input %d not ready, skipping frame\n", i);
+            return 0;
+        }
+    }
+
+    /* Output frame из output's hw_frames_pool */
+    out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
+    if (!out)
+        return AVERROR(ENOMEM);
+
+    /* Copy props (timestamps, color metadata) от первого input */
+    ret = av_frame_copy_props(out, in[0]);
+    if (ret < 0)
+        goto fail;
+    out->width  = s->out_width;
+    out->height = s->out_height;
+
+    /* CUDA context push для всех cuMemcpy в этом filter call */
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+    if (ret < 0)
+        goto fail;
+
+    /* Для каждого input — copy Y plane + UV plane в свою quadrant.
+     * NV12 layout: data[0] = Y, data[1] = UV interleaved. linesize[0/1] = pitch. */
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        AVFrame *src = in[i];
+        int cx = s->cells[i].x;
+        int cy = s->cells[i].y;
+        int cw = s->cells[i].w;
+        int ch = s->cells[i].h;
+
+        if (src->width != cw || src->height != ch) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "input %d size %dx%d != expected cell size %dx%d "
+                   "(Phase 1: no scaling, all inputs must match cell size)\n",
+                   i, src->width, src->height, cw, ch);
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }
+
+        /* Y plane (full resolution, 1 byte per pixel) */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[0], src->linesize[0],
+                               src->width, src->height,
+                               (CUdeviceptr)out->data[0], out->linesize[0],
+                               cx, cy, 1);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
+        }
+
+        /* UV plane (half resolution для NV12, но 2 bytes per "pixel" — interleaved UV) */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[1], src->linesize[1],
+                               src->width / 2, src->height / 2,
+                               (CUdeviceptr)out->data[1], out->linesize[1],
+                               cx / 2, cy / 2, 2);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
+        }
+    }
+
+    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&out);
+    return ret;
+}
+
+/* ─── Lifecycle: init / uninit / query_formats / config_input / config_output ─ */
+
+static av_cold int cuda_grid_init(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    /* Сами inputs регистрируем в filter struct (см. внизу — нельзя AVFILTER_INPUT_COUNT_MAX
+     * без явных AVFilterPad'ов). Phase 1 fix=4. */
+    (void)s;
+    return 0;
+}
+
+static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    ff_framesync_uninit(&s->fs);
+    av_buffer_unref(&s->hw_device_ctx);
+}
+
+static int cuda_grid_config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->src;
+    FilterLink      *inl = ff_filter_link(inlink);
+
+    if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
+        av_log(ctx, AV_LOG_ERROR, "input %d: software pixel format не поддерживается\n",
+               FF_INLINK_IDX(inlink));
+        return AVERROR(EINVAL);
+    }
+    return 0;
+}
+
+static int cuda_grid_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext     *ctx  = outlink->src;
+    CudaGridContext     *s    = ctx->priv;
+    AVFilterLink        *in0  = ctx->inputs[0];
+    FilterLink          *inl0 = ff_filter_link(in0);
+    FilterLink          *outl = ff_filter_link(outlink);
+    AVHWFramesContext   *hwfc0;
+    int W, H, ret;
+
+    if (!inl0->hw_frames_ctx)
+        return AVERROR(EINVAL);
+    hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data;
+
+    if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Phase 1 supports only NV12, got %s\n",
+               av_get_pix_fmt_name(hwfc0->sw_format));
+        return AVERROR(EINVAL);
+    }
+
+    /* Все inputs должны иметь одинаковый device и sw_format (Phase 1 also same size) */
+    W = in0->w;
+    H = in0->h;
+    for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
+        AVFilterLink *inN = ctx->inputs[i];
+        FilterLink   *ilN = ff_filter_link(inN);
+        AVHWFramesContext *hN;
+        if (!ilN->hw_frames_ctx)
+            return AVERROR(EINVAL);
+        hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data;
+        if (hN->device_ctx != hwfc0->device_ctx) {
+            av_log(ctx, AV_LOG_ERROR, "input %d device mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+        if (hN->sw_format != hwfc0->sw_format) {
+            av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+        if (inN->w != W || inN->h != H) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Phase 1: input %d size %dx%d != input 0 size %dx%d. "
+                   "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n",
+                   i, inN->w, inN->h, W, H);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    /* Output = 2W × 2H для quad layout */
+    s->out_width  = 2 * W;
+    s->out_height = 2 * H;
+    outlink->w = s->out_width;
+    outlink->h = s->out_height;
+
+    /* Hardcoded quad cell positions */
+    s->cells[0].x = 0;     s->cells[0].y = 0;     s->cells[0].w = W; s->cells[0].h = H;
+    s->cells[1].x = W;     s->cells[1].y = 0;     s->cells[1].w = W; s->cells[1].h = H;
+    s->cells[2].x = 0;     s->cells[2].y = H;     s->cells[2].w = W; s->cells[2].h = H;
+    s->cells[3].x = W;     s->cells[3].y = H;     s->cells[3].w = W; s->cells[3].h = H;
+
+    /* Setup CUDA device + stream context из input 0 */
+    AVHWDeviceContext *hwdev = hwfc0->device_ctx;
+    s->hwctx    = (AVCUDADeviceContext *)hwdev->hwctx;
+    s->cu_ctx   = s->hwctx->cuda_ctx;
+    s->cu_stream = s->hwctx->stream;
+
+    /* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */
+    AVBufferRef *out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    AVHWFramesContext *out_hwfc = (AVHWFramesContext *)out_ref->data;
+    out_hwfc->format    = AV_PIX_FMT_CUDA;
+    out_hwfc->sw_format = AV_PIX_FMT_NV12;
+    out_hwfc->width     = s->out_width;
+    out_hwfc->height    = s->out_height;
+    out_hwfc->initial_pool_size = 4;
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0) {
+        av_buffer_unref(&out_ref);
+        return ret;
+    }
+    outl->hw_frames_ctx = out_ref;
+
+    /* Setup framesync для lock-step pull от N inputs */
+    ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
+    if (ret < 0)
+        return ret;
+    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+        FFFrameSyncIn *fs_in = &s->fs.in[i];
+        fs_in->time_base = ctx->inputs[i]->time_base;
+        fs_in->sync   = 1;
+        fs_in->before = EXT_STOP;
+        fs_in->after  = EXT_INFINITY;
+    }
+    s->fs.opaque   = s;
+    s->fs.on_event = cuda_grid_compose;
+
+    outlink->time_base = ctx->inputs[0]->time_base;
+
+    return ff_framesync_configure(&s->fs);
+}
+
+static int cuda_grid_activate(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    return ff_framesync_activate(&s->fs);
+}
+
+/* ─── Filter registration ──────────────────────────────────────────────── */
+
+static const AVOption cuda_grid_options[] = {
+    /* Phase 1: no options. Phase 2 добавит `layout=`. */
+    { NULL }
+};
+
+static const AVClass cuda_grid_class = {
+    .class_name = "cuda_grid",
+    .item_name  = av_default_item_name,
+    .option     = cuda_grid_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_FILTER,
+};
+
+static const AVFilterPad cuda_grid_inputs[] = {
+    { .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+    { .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+    { .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+    { .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
+};
+
+static const AVFilterPad cuda_grid_outputs[] = {
+    { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output },
+};
+
+const AVFilter ff_vf_cuda_grid = {
+    .name           = "cuda_grid",
+    .description    = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
+    .priv_class     = &cuda_grid_class,
+    .priv_size      = sizeof(CudaGridContext),
+    .init           = cuda_grid_init,
+    .uninit         = cuda_grid_uninit,
+    .activate       = cuda_grid_activate,
+    FILTER_INPUTS(cuda_grid_inputs),
+    FILTER_OUTPUTS(cuda_grid_outputs),
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+    .flags          = AVFILTER_FLAG_HWDEVICE,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
-- 
2.52.0


From 4313c3f30d3adac36d73d239ff85069a30632e2b Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 20:50:17 +0100
Subject: [PATCH 02/14] vf_cuda_grid: fix #include cuda_check.h + mixed decl
 warnings (-Werror)

---
 libavfilter/vf_cuda_grid.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 34003fe..60a5ccd 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -19,6 +19,7 @@
 #include "config_components.h"
 
 #include "libavutil/common.h"
+#include "libavutil/cuda_check.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_cuda_internal.h"
 #include "libavutil/log.h"
@@ -201,8 +202,8 @@ static int cuda_grid_config_input(AVFilterLink *inlink)
     FilterLink      *inl = ff_filter_link(inlink);
 
     if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
-        av_log(ctx, AV_LOG_ERROR, "input %d: software pixel format не поддерживается\n",
-               FF_INLINK_IDX(inlink));
+        av_log(ctx, AV_LOG_ERROR, "input %s: software pixel format не поддерживается\n",
+               inlink->dstpad->name);
         return AVERROR(EINVAL);
     }
     return 0;
@@ -216,6 +217,9 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
     FilterLink          *inl0 = ff_filter_link(in0);
     FilterLink          *outl = ff_filter_link(outlink);
     AVHWFramesContext   *hwfc0;
+    AVHWDeviceContext   *hwdev;
+    AVBufferRef         *out_ref;
+    AVHWFramesContext   *out_hwfc;
     int W, H, ret;
 
     if (!inl0->hw_frames_ctx)
@@ -269,16 +273,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
     s->cells[3].x = W;     s->cells[3].y = H;     s->cells[3].w = W; s->cells[3].h = H;
 
     /* Setup CUDA device + stream context из input 0 */
-    AVHWDeviceContext *hwdev = hwfc0->device_ctx;
+    hwdev = hwfc0->device_ctx;
     s->hwctx    = (AVCUDADeviceContext *)hwdev->hwctx;
     s->cu_ctx   = s->hwctx->cuda_ctx;
     s->cu_stream = s->hwctx->stream;
 
     /* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */
-    AVBufferRef *out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
+    out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
     if (!out_ref)
         return AVERROR(ENOMEM);
-    AVHWFramesContext *out_hwfc = (AVHWFramesContext *)out_ref->data;
+    out_hwfc = (AVHWFramesContext *)out_ref->data;
     out_hwfc->format    = AV_PIX_FMT_CUDA;
     out_hwfc->sw_format = AV_PIX_FMT_NV12;
     out_hwfc->width     = s->out_width;
@@ -296,12 +300,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
     ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
     if (ret < 0)
         return ret;
-    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
-        FFFrameSyncIn *fs_in = &s->fs.in[i];
-        fs_in->time_base = ctx->inputs[i]->time_base;
-        fs_in->sync   = 1;
-        fs_in->before = EXT_STOP;
-        fs_in->after  = EXT_INFINITY;
+    {
+        int i;
+        FFFrameSyncIn *fs_in;
+        for (i = 0; i < CUDA_GRID_INPUTS; i++) {
+            fs_in = &s->fs.in[i];
+            fs_in->time_base = ctx->inputs[i]->time_base;
+            fs_in->sync   = 1;
+            fs_in->before = EXT_STOP;
+            fs_in->after  = EXT_INFINITY;
+        }
     }
     s->fs.opaque   = s;
     s->fs.on_event = cuda_grid_compose;
-- 
2.52.0


From 6ee2f474c7624d1414acf9e94702ba3974817e18 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 20:57:08 +0100
Subject: [PATCH 03/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202a=20?=
 =?UTF-8?q?=E2=80=94=20layout=20templates=20+=20dynamic=20nb=5Finputs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Layout templates (9):
- single, dual_horizontal, dual_vertical
- quad (default), main_plus_preview (1 big + 3 small)
- six_grid (3x2), nine_grid (3x3), sixteen_grid (4x4)
- panoramic

Cells определены в normalized координатах (0.0-1.0), переводятся в pixels
в config_output (× out_w/out_h). Alignment до chroma boundary (NV12 ÷ 2).

Filter options:
- layout=<name> (default quad)
- out_w=<int> (default 1920)
- out_h=<int> (default 1080)

Dynamic inputs:
- nb_inputs derived из layout (single=1, quad=4, nine_grid=9, sixteen_grid=16)
- ff_append_inpad_free_name в init() для каждой cell
- AVFILTER_FLAG_DYNAMIC_INPUTS на filter

Phase 2a limitation:
- Каждый input должен быть точно cell_px size (no scaling).
- Phase 2b добавит NPP resize для mixed-size inputs.
---
 libavfilter/vf_cuda_grid.c | 254 +++++++++++++++++++++++++------------
 1 file changed, 176 insertions(+), 78 deletions(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 60a5ccd..9537b54 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -4,12 +4,13 @@
  * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
  * в layout. End-to-end CUDA (без CPU round-trip).
  *
- * Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера,
- * output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per
- * Y/UV plane на каждый input → soответствующую quadrant'у output.
+ * Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
+ * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size
+ * через option (default 1920×1080). Cell rects = normalized × output size.
+ * **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP).
  *
  * Future phases (см. gx/vf-cuda-grid#1):
- *  - Phase 2: dynamic layouts + per-cell scaling
+ *  - Phase 2b: per-cell scaling через libnpp (mixed-size inputs)
  *  - Phase 3: runtime layout switching через process_command (ZMQ)
  *  - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
  *
@@ -34,28 +35,115 @@
 #include "video.h"
 
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+#define MAX_CELLS 16
 
-#define CUDA_GRID_INPUTS 4  /* Phase 1: fixed quad */
+/* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */
+
+typedef struct LayoutCell {
+    float x, y, w, h;   /* normalized fraction of output size */
+} LayoutCell;
+
+typedef struct LayoutTemplate {
+    const char *name;
+    int nb_cells;
+    LayoutCell cells[MAX_CELLS];
+} LayoutTemplate;
+
+/* Layouts copy-paste'нуты по структуре cctv-processor config/grids.json. */
+static const LayoutTemplate layouts[] = {
+    {
+        "single", 1,
+        { {0.0f, 0.0f, 1.0f, 1.0f} }
+    },
+    {
+        "dual_horizontal", 2,
+        { {0.0f, 0.0f, 0.5f, 1.0f}, {0.5f, 0.0f, 0.5f, 1.0f} }
+    },
+    {
+        "dual_vertical", 2,
+        { {0.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 0.5f, 1.0f, 0.5f} }
+    },
+    {
+        "quad", 4,
+        {
+            {0.0f, 0.0f, 0.5f, 0.5f}, {0.5f, 0.0f, 0.5f, 0.5f},
+            {0.0f, 0.5f, 0.5f, 0.5f}, {0.5f, 0.5f, 0.5f, 0.5f},
+        }
+    },
+    {
+        /* Main camera 2/3 width, 3 small cameras stacked справа сверху вниз */
+        "main_plus_preview", 4,
+        {
+            {0.0f,      0.0f, 2.0f/3, 1.0f},
+            {2.0f/3,    0.0f, 1.0f/3, 1.0f/3},
+            {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
+            {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
+        }
+    },
+    {
+        "six_grid", 6,
+        {
+            {0.0f, 0.0f, 1.0f/3, 0.5f}, {1.0f/3, 0.0f, 1.0f/3, 0.5f}, {2.0f/3, 0.0f, 1.0f/3, 0.5f},
+            {0.0f, 0.5f, 1.0f/3, 0.5f}, {1.0f/3, 0.5f, 1.0f/3, 0.5f}, {2.0f/3, 0.5f, 1.0f/3, 0.5f},
+        }
+    },
+    {
+        "nine_grid", 9,
+        {
+            {0.0f, 0.0f, 1.0f/3, 1.0f/3}, {1.0f/3, 0.0f, 1.0f/3, 1.0f/3}, {2.0f/3, 0.0f, 1.0f/3, 1.0f/3},
+            {0.0f, 1.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
+            {0.0f, 2.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
+        }
+    },
+    {
+        "sixteen_grid", 16,
+        {
+            {0.00f, 0.00f, 0.25f, 0.25f}, {0.25f, 0.00f, 0.25f, 0.25f}, {0.50f, 0.00f, 0.25f, 0.25f}, {0.75f, 0.00f, 0.25f, 0.25f},
+            {0.00f, 0.25f, 0.25f, 0.25f}, {0.25f, 0.25f, 0.25f, 0.25f}, {0.50f, 0.25f, 0.25f, 0.25f}, {0.75f, 0.25f, 0.25f, 0.25f},
+            {0.00f, 0.50f, 0.25f, 0.25f}, {0.25f, 0.50f, 0.25f, 0.25f}, {0.50f, 0.50f, 0.25f, 0.25f}, {0.75f, 0.50f, 0.25f, 0.25f},
+            {0.00f, 0.75f, 0.25f, 0.25f}, {0.25f, 0.75f, 0.25f, 0.25f}, {0.50f, 0.75f, 0.25f, 0.25f}, {0.75f, 0.75f, 0.25f, 0.25f},
+        }
+    },
+    {
+        /* Один widescreen panoramic — 1 cell full width */
+        "panoramic", 1,
+        { {0.0f, 0.0f, 1.0f, 1.0f} }
+    },
+};
+
+static const LayoutTemplate *find_layout(const char *name)
+{
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(layouts); i++) {
+        if (!strcmp(layouts[i].name, name))
+            return &layouts[i];
+    }
+    return NULL;
+}
+
+/* ─── Filter state ─────────────────────────────────────────────────────── */
 
 typedef struct CudaGridContext {
     const AVClass *class;
 
-    AVBufferRef *hw_device_ctx;
+    /* Options */
+    char *layout_name;
+    int   out_width;
+    int   out_height;
+
+    /* Resolved layout (после init) */
+    const LayoutTemplate *layout;
+
+    /* CUDA */
     AVCUDADeviceContext *hwctx;
     CUcontext cu_ctx;
     CUstream  cu_stream;
 
     FFFrameSync fs;
 
-    /* Output dimensions (computed in config_output) */
-    int out_width;
-    int out_height;
-
-    /* Per-cell target rectangles в output frame.
-     * Phase 1 hardcode: 4 ячейки 2×2 (top-left, top-right, bottom-left, bottom-right). */
+    /* Per-cell pixel rects (computed в config_output из normalized × out size) */
     struct {
         int x, y, w, h;
-    } cells[CUDA_GRID_INPUTS];
+    } cell_px[MAX_CELLS];
 } CudaGridContext;
 
 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -87,7 +175,7 @@ static int copy_input_plane(AVFilterContext *ctx,
     return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
 }
 
-/* ─── Framesync callback — N frames аre ready, compose ────────────────── */
+/* ─── Framesync callback ──────────────────────────────────────────────── */
 
 static int cuda_grid_compose(FFFrameSync *fs)
 {
@@ -95,12 +183,12 @@ static int cuda_grid_compose(FFFrameSync *fs)
     AVFilterLink    *outlink = ctx->outputs[0];
     CudaGridContext *s       = ctx->priv;
     AVFrame *out = NULL;
-    AVFrame *in[CUDA_GRID_INPUTS] = {0};
+    AVFrame *in[MAX_CELLS] = {0};
     CUcontext dummy;
-    int ret;
+    int i, ret;
+    int nb = s->layout->nb_cells;
 
-    /* Сбор всех N input frames из framesync */
-    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+    for (i = 0; i < nb; i++) {
         ret = ff_framesync_get_frame(fs, i, &in[i], 0);
         if (ret < 0)
             return ret;
@@ -110,43 +198,38 @@ static int cuda_grid_compose(FFFrameSync *fs)
         }
     }
 
-    /* Output frame из output's hw_frames_pool */
     out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
     if (!out)
         return AVERROR(ENOMEM);
 
-    /* Copy props (timestamps, color metadata) от первого input */
     ret = av_frame_copy_props(out, in[0]);
     if (ret < 0)
         goto fail;
     out->width  = s->out_width;
     out->height = s->out_height;
 
-    /* CUDA context push для всех cuMemcpy в этом filter call */
     ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
     if (ret < 0)
         goto fail;
 
-    /* Для каждого input — copy Y plane + UV plane в свою quadrant.
-     * NV12 layout: data[0] = Y, data[1] = UV interleaved. linesize[0/1] = pitch. */
-    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
+    for (i = 0; i < nb; i++) {
         AVFrame *src = in[i];
-        int cx = s->cells[i].x;
-        int cy = s->cells[i].y;
-        int cw = s->cells[i].w;
-        int ch = s->cells[i].h;
+        int cx = s->cell_px[i].x;
+        int cy = s->cell_px[i].y;
+        int cw = s->cell_px[i].w;
+        int ch = s->cell_px[i].h;
 
         if (src->width != cw || src->height != ch) {
             av_log(ctx, AV_LOG_ERROR,
-                   "input %d size %dx%d != expected cell size %dx%d "
-                   "(Phase 1: no scaling, all inputs must match cell size)\n",
+                   "input %d size %dx%d != cell size %dx%d "
+                   "(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n",
                    i, src->width, src->height, cw, ch);
             CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
             ret = AVERROR(EINVAL);
             goto fail;
         }
 
-        /* Y plane (full resolution, 1 byte per pixel) */
+        /* Y plane */
         ret = copy_input_plane(ctx,
                                (CUdeviceptr)src->data[0], src->linesize[0],
                                src->width, src->height,
@@ -157,7 +240,7 @@ static int cuda_grid_compose(FFFrameSync *fs)
             goto fail;
         }
 
-        /* UV plane (half resolution для NV12, но 2 bytes per "pixel" — interleaved UV) */
+        /* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */
         ret = copy_input_plane(ctx,
                                (CUdeviceptr)src->data[1], src->linesize[1],
                                src->width / 2, src->height / 2,
@@ -170,7 +253,6 @@ static int cuda_grid_compose(FFFrameSync *fs)
     }
 
     CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-
     return ff_filter_frame(outlink, out);
 
 fail:
@@ -178,14 +260,38 @@ fail:
     return ret;
 }
 
-/* ─── Lifecycle: init / uninit / query_formats / config_input / config_output ─ */
+/* ─── Lifecycle ────────────────────────────────────────────────────────── */
 
 static av_cold int cuda_grid_init(AVFilterContext *ctx)
 {
     CudaGridContext *s = ctx->priv;
-    /* Сами inputs регистрируем в filter struct (см. внизу — нельзя AVFILTER_INPUT_COUNT_MAX
-     * без явных AVFilterPad'ов). Phase 1 fix=4. */
-    (void)s;
+    const LayoutTemplate *lt;
+    int i, ret;
+
+    lt = find_layout(s->layout_name);
+    if (!lt) {
+        av_log(ctx, AV_LOG_ERROR, "unknown layout '%s'. Доступны: ", s->layout_name);
+        for (i = 0; i < (int)FF_ARRAY_ELEMS(layouts); i++)
+            av_log(ctx, AV_LOG_ERROR, "%s ", layouts[i].name);
+        av_log(ctx, AV_LOG_ERROR, "\n");
+        return AVERROR(EINVAL);
+    }
+    s->layout = lt;
+
+    /* Dynamic inputs — append pad per cell */
+    for (i = 0; i < lt->nb_cells; i++) {
+        AVFilterPad pad = { 0 };
+        pad.type = AVMEDIA_TYPE_VIDEO;
+        pad.name = av_asprintf("input%d", i);
+        if (!pad.name)
+            return AVERROR(ENOMEM);
+        ret = ff_append_inpad_free_name(ctx, &pad);
+        if (ret < 0)
+            return ret;
+    }
+
+    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n",
+           lt->name, lt->nb_cells, s->out_width, s->out_height);
     return 0;
 }
 
@@ -193,7 +299,6 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
 {
     CudaGridContext *s = ctx->priv;
     ff_framesync_uninit(&s->fs);
-    av_buffer_unref(&s->hw_device_ctx);
 }
 
 static int cuda_grid_config_input(AVFilterLink *inlink)
@@ -220,7 +325,7 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
     AVHWDeviceContext   *hwdev;
     AVBufferRef         *out_ref;
     AVHWFramesContext   *out_hwfc;
-    int W, H, ret;
+    int i, ret;
 
     if (!inl0->hw_frames_ctx)
         return AVERROR(EINVAL);
@@ -228,15 +333,28 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
 
     if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
         av_log(ctx, AV_LOG_ERROR,
-               "Phase 1 supports only NV12, got %s\n",
+               "Phase 1-2a поддерживают только NV12, got %s\n",
                av_get_pix_fmt_name(hwfc0->sw_format));
         return AVERROR(EINVAL);
     }
 
-    /* Все inputs должны иметь одинаковый device и sw_format (Phase 1 also same size) */
-    W = in0->w;
-    H = in0->h;
-    for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
+    /* Compute pixel rects из normalized layout × output size */
+    for (i = 0; i < s->layout->nb_cells; i++) {
+        s->cell_px[i].x = (int)(s->layout->cells[i].x * s->out_width);
+        s->cell_px[i].y = (int)(s->layout->cells[i].y * s->out_height);
+        s->cell_px[i].w = (int)(s->layout->cells[i].w * s->out_width);
+        s->cell_px[i].h = (int)(s->layout->cells[i].h * s->out_height);
+        /* Align до chroma boundary (NV12 → 2x2) */
+        s->cell_px[i].x &= ~1;
+        s->cell_px[i].y &= ~1;
+        s->cell_px[i].w &= ~1;
+        s->cell_px[i].h &= ~1;
+        av_log(ctx, AV_LOG_VERBOSE, "  cell[%d] = %dx%d @ (%d,%d)\n",
+               i, s->cell_px[i].w, s->cell_px[i].h, s->cell_px[i].x, s->cell_px[i].y);
+    }
+
+    /* Validate все inputs: device match + sw_format match */
+    for (i = 1; i < s->layout->nb_cells; i++) {
         AVFilterLink *inN = ctx->inputs[i];
         FilterLink   *ilN = ff_filter_link(inN);
         AVHWFramesContext *hN;
@@ -251,34 +369,16 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
             av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
             return AVERROR(EINVAL);
         }
-        if (inN->w != W || inN->h != H) {
-            av_log(ctx, AV_LOG_ERROR,
-                   "Phase 1: input %d size %dx%d != input 0 size %dx%d. "
-                   "В этой фазе scaling не поддерживается, все inputs должны быть одного размера.\n",
-                   i, inN->w, inN->h, W, H);
-            return AVERROR(EINVAL);
-        }
     }
 
-    /* Output = 2W × 2H для quad layout */
-    s->out_width  = 2 * W;
-    s->out_height = 2 * H;
     outlink->w = s->out_width;
     outlink->h = s->out_height;
 
-    /* Hardcoded quad cell positions */
-    s->cells[0].x = 0;     s->cells[0].y = 0;     s->cells[0].w = W; s->cells[0].h = H;
-    s->cells[1].x = W;     s->cells[1].y = 0;     s->cells[1].w = W; s->cells[1].h = H;
-    s->cells[2].x = 0;     s->cells[2].y = H;     s->cells[2].w = W; s->cells[2].h = H;
-    s->cells[3].x = W;     s->cells[3].y = H;     s->cells[3].w = W; s->cells[3].h = H;
-
-    /* Setup CUDA device + stream context из input 0 */
     hwdev = hwfc0->device_ctx;
     s->hwctx    = (AVCUDADeviceContext *)hwdev->hwctx;
     s->cu_ctx   = s->hwctx->cuda_ctx;
     s->cu_stream = s->hwctx->stream;
 
-    /* Аллокация output hw_frames_ctx — copy от input #0 с обновлёнными размерами */
     out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
     if (!out_ref)
         return AVERROR(ENOMEM);
@@ -296,14 +396,12 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
     }
     outl->hw_frames_ctx = out_ref;
 
-    /* Setup framesync для lock-step pull от N inputs */
-    ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
+    ret = ff_framesync_init(&s->fs, ctx, s->layout->nb_cells);
     if (ret < 0)
         return ret;
     {
-        int i;
         FFFrameSyncIn *fs_in;
-        for (i = 0; i < CUDA_GRID_INPUTS; i++) {
+        for (i = 0; i < s->layout->nb_cells; i++) {
             fs_in = &s->fs.in[i];
             fs_in->time_base = ctx->inputs[i]->time_base;
             fs_in->sync   = 1;
@@ -315,7 +413,6 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
     s->fs.on_event = cuda_grid_compose;
 
     outlink->time_base = ctx->inputs[0]->time_base;
-
     return ff_framesync_configure(&s->fs);
 }
 
@@ -325,10 +422,18 @@ static int cuda_grid_activate(AVFilterContext *ctx)
     return ff_framesync_activate(&s->fs);
 }
 
-/* ─── Filter registration ──────────────────────────────────────────────── */
+/* ─── Options + registration ───────────────────────────────────────────── */
+
+#define OFFSET(x) offsetof(CudaGridContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
 
 static const AVOption cuda_grid_options[] = {
-    /* Phase 1: no options. Phase 2 добавит `layout=`. */
+    { "layout", "имя layout template",
+      OFFSET(layout_name), AV_OPT_TYPE_STRING, { .str = "quad" }, 0, 0, FLAGS },
+    { "out_w", "ширина output frame в пикселях",
+      OFFSET(out_width),   AV_OPT_TYPE_INT,    { .i64 = 1920 }, 16, 16384, FLAGS },
+    { "out_h", "высота output frame в пикселях",
+      OFFSET(out_height),  AV_OPT_TYPE_INT,    { .i64 = 1080 }, 16, 16384, FLAGS },
     { NULL }
 };
 
@@ -340,13 +445,6 @@ static const AVClass cuda_grid_class = {
     .category   = AV_CLASS_CATEGORY_FILTER,
 };
 
-static const AVFilterPad cuda_grid_inputs[] = {
-    { .name = "input0", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-    { .name = "input1", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-    { .name = "input2", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-    { .name = "input3", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_input },
-};
-
 static const AVFilterPad cuda_grid_outputs[] = {
     { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output },
 };
@@ -359,9 +457,9 @@ const AVFilter ff_vf_cuda_grid = {
     .init           = cuda_grid_init,
     .uninit         = cuda_grid_uninit,
     .activate       = cuda_grid_activate,
-    FILTER_INPUTS(cuda_grid_inputs),
+    /* No FILTER_INPUTS — pads added dynamically в init() per layout. */
     FILTER_OUTPUTS(cuda_grid_outputs),
     FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
-    .flags          = AVFILTER_FLAG_HWDEVICE,
+    .flags          = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
     .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
 };
-- 
2.52.0


From df476472e2e311f06e81c0669f10fca7ceb97e29 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 20:58:29 +0100
Subject: [PATCH 04/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20fix=20include=20avstr?=
 =?UTF-8?q?ing.h=20=D0=B4=D0=BB=D1=8F=20av=5Fasprintf?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 libavfilter/vf_cuda_grid.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 9537b54..bfc4a53 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -19,6 +19,7 @@
 
 #include "config_components.h"
 
+#include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/cuda_check.h"
 #include "libavutil/hwcontext.h"
-- 
2.52.0


From 11f310061a3ac0290b790c565d8d4cb86cabdbf5 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 21:20:04 +0100
Subject: [PATCH 05/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202b=20?=
 =?UTF-8?q?=E2=80=94=20NPP=20scaling=20=D0=B4=D0=BB=D1=8F=20mixed-size=20i?=
 =?UTF-8?q?nputs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add libnpp dependency в configure (cuda_grid_filter_deps)
- #include <nppi.h>, nppSetStream(s->cu_stream) перед resize batch
- Smart copy-or-scale в compose path:
  - src.size == cell.size → cuMemcpy2DAsync (fast path, zero overhead)
  - else → nppiResizeSqrPixel_8u_C1R для Y + _C2R для NV12 UV interleaved
- NPPI_INTER_LINEAR interpolation (bilinear — стандартный для video)
- Destination pointer offset через explicit pointer arithmetic
  (NPP не имеет dst_offset параметра, нужно сместить pSrc указатель)

Это unblock'ает mixed-size cameras (parking 1920x1080 + gate_lpr 2688x1520
в одном grid с main_plus_preview layout — big cell scaled до 1280x1080,
small cells scaled до 640x360).

Phase 2 complete (2a + 2b). Phase 3 будет controller sidecar.
---
 configure                  |  2 +-
 libavfilter/vf_cuda_grid.c | 96 +++++++++++++++++++++++++++-----------
 2 files changed, 69 insertions(+), 29 deletions(-)

diff --git a/configure b/configure
index 9c60cb7..cc6a635 100755
--- a/configure
+++ b/configure
@@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
-cuda_grid_filter_deps="ffnvcodec"
+cuda_grid_filter_deps="ffnvcodec libnpp"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index bfc4a53..cfad54b 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -19,6 +19,8 @@
 
 #include "config_components.h"
 
+#include <nppi.h>
+
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/cuda_check.h"
@@ -213,6 +215,9 @@ static int cuda_grid_compose(FFFrameSync *fs)
     if (ret < 0)
         goto fail;
 
+    /* NPP в этом thread'е работает в нашем CUDA stream */
+    nppSetStream(s->cu_stream);
+
     for (i = 0; i < nb; i++) {
         AVFrame *src = in[i];
         int cx = s->cell_px[i].x;
@@ -220,36 +225,71 @@ static int cuda_grid_compose(FFFrameSync *fs)
         int cw = s->cell_px[i].w;
         int ch = s->cell_px[i].h;
 
-        if (src->width != cw || src->height != ch) {
-            av_log(ctx, AV_LOG_ERROR,
-                   "input %d size %dx%d != cell size %dx%d "
-                   "(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n",
-                   i, src->width, src->height, cw, ch);
-            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-            ret = AVERROR(EINVAL);
-            goto fail;
-        }
+        if (src->width == cw && src->height == ch) {
+            /* Fast path: same size — memcpy без NPP overhead */
+            ret = copy_input_plane(ctx,
+                                   (CUdeviceptr)src->data[0], src->linesize[0],
+                                   src->width, src->height,
+                                   (CUdeviceptr)out->data[0], out->linesize[0],
+                                   cx, cy, 1);
+            if (ret < 0) {
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                goto fail;
+            }
+            ret = copy_input_plane(ctx,
+                                   (CUdeviceptr)src->data[1], src->linesize[1],
+                                   src->width / 2, src->height / 2,
+                                   (CUdeviceptr)out->data[1], out->linesize[1],
+                                   cx / 2, cy / 2, 2);
+            if (ret < 0) {
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                goto fail;
+            }
+        } else {
+            /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords
+             * выровнены до 2 в config_output). */
+            NppStatus npp_err;
+            double xfactor_y  = (double)cw       / src->width;
+            double yfactor_y  = (double)ch       / src->height;
+            double xfactor_uv = (double)(cw / 2) / (src->width  / 2);
+            double yfactor_uv = (double)(ch / 2) / (src->height / 2);
 
-        /* Y plane */
-        ret = copy_input_plane(ctx,
-                               (CUdeviceptr)src->data[0], src->linesize[0],
-                               src->width, src->height,
-                               (CUdeviceptr)out->data[0], out->linesize[0],
-                               cx, cy, 1);
-        if (ret < 0) {
-            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-            goto fail;
-        }
+            uint8_t *dst_y_ptr  = out->data[0] + (size_t)cy       * out->linesize[0] + cx;
+            uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx;
+            /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */
 
-        /* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */
-        ret = copy_input_plane(ctx,
-                               (CUdeviceptr)src->data[1], src->linesize[1],
-                               src->width / 2, src->height / 2,
-                               (CUdeviceptr)out->data[1], out->linesize[1],
-                               cx / 2, cy / 2, 2);
-        if (ret < 0) {
-            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-            goto fail;
+            /* Y plane — 1 channel (luma) */
+            npp_err = nppiResizeSqrPixel_8u_C1R(
+                src->data[0], (NppiSize){src->width, src->height},
+                src->linesize[0], (NppiRect){0, 0, src->width, src->height},
+                dst_y_ptr,    out->linesize[0],
+                (NppiRect){0, 0, cw, ch},
+                xfactor_y, yfactor_y, 0.0, 0.0,
+                NPPI_INTER_LINEAR);
+            if (npp_err != NPP_SUCCESS) {
+                av_log(ctx, AV_LOG_ERROR,
+                       "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n",
+                       i, src->width, src->height, cw, ch, npp_err);
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                ret = AVERROR_EXTERNAL;
+                goto fail;
+            }
+
+            /* UV plane — 2 channels interleaved (NV12 chroma) */
+            npp_err = nppiResizeSqrPixel_8u_C2R(
+                src->data[1], (NppiSize){src->width / 2, src->height / 2},
+                src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2},
+                dst_uv_ptr,   out->linesize[1],
+                (NppiRect){0, 0, cw / 2, ch / 2},
+                xfactor_uv, yfactor_uv, 0.0, 0.0,
+                NPPI_INTER_LINEAR);
+            if (npp_err != NPP_SUCCESS) {
+                av_log(ctx, AV_LOG_ERROR,
+                       "input %d UV plane NPP resize failed: %d\n", i, npp_err);
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                ret = AVERROR_EXTERNAL;
+                goto fail;
+            }
         }
     }
 
-- 
2.52.0


From 178fc5bb4eb53a925256ecba5041fff0744e5f3a Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 21:45:40 +0100
Subject: [PATCH 06/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202b=20?=
 =?UTF-8?q?=E2=80=94=20delegated=20scaling=20to=20upstream=20scale=5Fnpp?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

После попытки in-filter NPP scaling обнаружено что nppiResize не имеет
_C2R variant для NV12 UV interleaved (только C1R, C3R, C4R). Alternatives:
- 2× nppiResize_8u_C1R с split/merge через intermediate buffers
- custom CUDA kernel
- treat UV pair как 16u (blending artifact на boundaries)

Pragmatic decision: cuda_grid делает только composition (same-size memcpy),
а scaling делегируется существующему scale_npp filter в filter chain:

  [0]scale_npp=1280:1080[s0]; [1]scale_npp=640:360[s1]; ... \
  [s0][s1]...cuda_grid=layout=main_plus_preview

Unix philosophy + leverages production-tested NPP code. Controller (Phase 3)
auto-generates filter graph с scale_npp per input.

Revert:
- #include <nppi.h>
- libnpp dependency в configure (cuda_grid_filter_deps="ffnvcodec")
- nppiResize* calls в compose path

Add:
- Error message с примером scale_npp chain pattern
- Doc в file header c filter graph пример

Phase 2 = full deliverable (2a + 2b). Дальше Phase 3 controller.
---
 configure                  |   2 +-
 libavfilter/vf_cuda_grid.c | 116 ++++++++++++++-----------------------
 2 files changed, 46 insertions(+), 72 deletions(-)

diff --git a/configure b/configure
index cc6a635..9c60cb7 100755
--- a/configure
+++ b/configure
@@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
-cuda_grid_filter_deps="ffnvcodec libnpp"
+cuda_grid_filter_deps="ffnvcodec"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index cfad54b..eca57f0 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -4,13 +4,25 @@
  * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
  * в layout. End-to-end CUDA (без CPU round-trip).
  *
- * Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
+ * Phase 2: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
  * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size
  * через option (default 1920×1080). Cell rects = normalized × output size.
- * **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP).
+ *
+ * **Scaling delegated to upstream `scale_npp`** filter (Unix philosophy +
+ * production-tested NPP code). NPP не имеет nppiResize_8u_C2R для NV12 UV
+ * interleaved, поэтому in-filter scaling = either two intermediate plane
+ * buffers либо custom CUDA kernel — оба больше work чем filter chain'ить:
+ *
+ *   ffmpeg ... -filter_complex \
+ *     "[0]scale_npp=1280:1080[s0]; \
+ *      [1]scale_npp=640:360[s1]; \
+ *      [2]scale_npp=640:360[s2]; \
+ *      [3]scale_npp=640:360[s3]; \
+ *      [s0][s1][s2][s3]cuda_grid=layout=main_plus_preview[out]"
+ *
+ * Controller (Phase 3) auto-generates filter graph с scale_npp per input.
  *
  * Future phases (см. gx/vf-cuda-grid#1):
- *  - Phase 2b: per-cell scaling через libnpp (mixed-size inputs)
  *  - Phase 3: runtime layout switching через process_command (ZMQ)
  *  - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
  *
@@ -19,8 +31,6 @@
 
 #include "config_components.h"
 
-#include <nppi.h>
-
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/cuda_check.h"
@@ -215,9 +225,6 @@ static int cuda_grid_compose(FFFrameSync *fs)
     if (ret < 0)
         goto fail;
 
-    /* NPP в этом thread'е работает в нашем CUDA stream */
-    nppSetStream(s->cu_stream);
-
     for (i = 0; i < nb; i++) {
         AVFrame *src = in[i];
         int cx = s->cell_px[i].x;
@@ -225,71 +232,38 @@ static int cuda_grid_compose(FFFrameSync *fs)
         int cw = s->cell_px[i].w;
         int ch = s->cell_px[i].h;
 
-        if (src->width == cw && src->height == ch) {
-            /* Fast path: same size — memcpy без NPP overhead */
-            ret = copy_input_plane(ctx,
-                                   (CUdeviceptr)src->data[0], src->linesize[0],
-                                   src->width, src->height,
-                                   (CUdeviceptr)out->data[0], out->linesize[0],
-                                   cx, cy, 1);
-            if (ret < 0) {
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                goto fail;
-            }
-            ret = copy_input_plane(ctx,
-                                   (CUdeviceptr)src->data[1], src->linesize[1],
-                                   src->width / 2, src->height / 2,
-                                   (CUdeviceptr)out->data[1], out->linesize[1],
-                                   cx / 2, cy / 2, 2);
-            if (ret < 0) {
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                goto fail;
-            }
-        } else {
-            /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords
-             * выровнены до 2 в config_output). */
-            NppStatus npp_err;
-            double xfactor_y  = (double)cw       / src->width;
-            double yfactor_y  = (double)ch       / src->height;
-            double xfactor_uv = (double)(cw / 2) / (src->width  / 2);
-            double yfactor_uv = (double)(ch / 2) / (src->height / 2);
+        if (src->width != cw || src->height != ch) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "input %d size %dx%d != cell size %dx%d. "
+                   "cuda_grid не делает scaling — используй upstream scale_npp:\n"
+                   "  [in%d]scale_npp=%d:%d[scaled%d]; [scaled%d]...cuda_grid=layout=%s\n",
+                   i, src->width, src->height, cw, ch,
+                   i, cw, ch, i, i, s->layout->name);
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }
 
-            uint8_t *dst_y_ptr  = out->data[0] + (size_t)cy       * out->linesize[0] + cx;
-            uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx;
-            /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */
+        /* Y plane (1 byte per pixel) */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[0], src->linesize[0],
+                               src->width, src->height,
+                               (CUdeviceptr)out->data[0], out->linesize[0],
+                               cx, cy, 1);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
+        }
 
-            /* Y plane — 1 channel (luma) */
-            npp_err = nppiResizeSqrPixel_8u_C1R(
-                src->data[0], (NppiSize){src->width, src->height},
-                src->linesize[0], (NppiRect){0, 0, src->width, src->height},
-                dst_y_ptr,    out->linesize[0],
-                (NppiRect){0, 0, cw, ch},
-                xfactor_y, yfactor_y, 0.0, 0.0,
-                NPPI_INTER_LINEAR);
-            if (npp_err != NPP_SUCCESS) {
-                av_log(ctx, AV_LOG_ERROR,
-                       "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n",
-                       i, src->width, src->height, cw, ch, npp_err);
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                ret = AVERROR_EXTERNAL;
-                goto fail;
-            }
-
-            /* UV plane — 2 channels interleaved (NV12 chroma) */
-            npp_err = nppiResizeSqrPixel_8u_C2R(
-                src->data[1], (NppiSize){src->width / 2, src->height / 2},
-                src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2},
-                dst_uv_ptr,   out->linesize[1],
-                (NppiRect){0, 0, cw / 2, ch / 2},
-                xfactor_uv, yfactor_uv, 0.0, 0.0,
-                NPPI_INTER_LINEAR);
-            if (npp_err != NPP_SUCCESS) {
-                av_log(ctx, AV_LOG_ERROR,
-                       "input %d UV plane NPP resize failed: %d\n", i, npp_err);
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                ret = AVERROR_EXTERNAL;
-                goto fail;
-            }
+        /* UV plane (NV12: half resolution, 2 bytes per "pixel") */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[1], src->linesize[1],
+                               src->width / 2, src->height / 2,
+                               (CUdeviceptr)out->data[1], out->linesize[1],
+                               cx / 2, cy / 2, 2);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
         }
     }
 
-- 
2.52.0


From 9deaca769710c88d227316f615e241d73ec2f483 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 22:17:41 +0100
Subject: [PATCH 07/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-1=20?=
 =?UTF-8?q?=E2=80=94=20rect=20overlay=20primitives=20(solid=20fill,=20no?=
 =?UTF-8?q?=20alpha)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Добавляет inner overlay state с mutex + process_command handler.
Rendering filled/border rects через cuMemsetD2D8Async/D2D16Async — без
custom kernel'а (Phase 4b-2 = alpha blend, требует .cu).

Commands:
  add_overlay    <id> rect cell=N x=.. y=.. w=.. h=.. r=.. g=.. b=.. thickness=.. opacity=..
  remove_overlay <id>
  clear_overlays

text/icon/dim — типы определены, render заглушен до Phase 4b-2/3/4.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 libavfilter/vf_cuda_grid.c | 402 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 391 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index eca57f0..9f5cc51 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -40,6 +40,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/thread.h"
 
 #include "avfilter.h"
 #include "filters.h"
@@ -48,7 +49,46 @@
 #include "video.h"
 
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
-#define MAX_CELLS 16
+#define MAX_CELLS    16
+#define MAX_OVERLAYS 64
+#define OVERLAY_ID_MAX 32
+#define OVERLAY_TEXT_MAX 128
+
+/* ─── Overlay primitives (Phase 4b-1: rect only, no alpha) ─────────────── */
+
+typedef enum {
+    OV_TYPE_RECT = 0,
+    OV_TYPE_TEXT,   /* Phase 4b-3 */
+    OV_TYPE_ICON,   /* Phase 4b-4 */
+    OV_TYPE_DIM,    /* Phase 4b-2 (alpha kernel) */
+} GridOverlayType;
+
+typedef struct GridOverlay {
+    char            id[OVERLAY_ID_MAX];
+    GridOverlayType type;
+    int             cell;       /* -1 = absolute on output, иначе cell-relative */
+    float           x, y, w, h; /* [0..1] normalized; relative к cell или output */
+    int             z_order;    /* меньше → рисуется первым */
+    uint8_t         opacity;    /* 0..255, реально применяется в Phase 4b-2 */
+    int             visible;
+    union {
+        struct {
+            uint8_t r, g, b;
+            int     thickness;  /* 0 = filled, >0 = border (border via 4 strips) */
+        } rect;
+        struct {
+            char    text[OVERLAY_TEXT_MAX];
+            int     font_size;
+            uint8_t r, g, b;
+        } text;
+        struct {
+            char    icon_name[32];
+        } icon;
+        struct {
+            uint8_t amount; /* 0..255, alpha for dim */
+        } dim;
+    } u;
+} GridOverlay;
 
 /* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */
 
@@ -157,6 +197,12 @@ typedef struct CudaGridContext {
     struct {
         int x, y, w, h;
     } cell_px[MAX_CELLS];
+
+    /* Overlay state (Phase 4b) — mutex-guarded, process_command/render thread-safe */
+    GridOverlay     overlays[MAX_OVERLAYS];
+    int             nb_overlays;
+    pthread_mutex_t overlay_lock;
+    int             overlay_lock_inited;
 } CudaGridContext;
 
 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -188,6 +234,272 @@ static int copy_input_plane(AVFilterContext *ctx,
     return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
 }
 
+/* ─── Overlay parsing + rendering (Phase 4b-1: rect, solid fill, no alpha) ─ */
+
+/* BT.709 limited-range RGB → YUV. Достаточно для HDTV (любое 1920×1080 output). */
+static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b,
+                                            uint8_t *Y, uint8_t *U, uint8_t *V)
+{
+    int y = (int)( 0.183f * r + 0.614f * g + 0.062f * b) + 16;
+    int u = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128;
+    int v = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128;
+    *Y = av_clip_uint8(y);
+    *U = av_clip_uint8(u);
+    *V = av_clip_uint8(v);
+}
+
+/* Parse args в формате: "<id> <type> <key>=<val> <key>=<val> ..."
+ * Out *ov заполняется defaults + parsed values. Returns 0 / AVERROR. */
+static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov)
+{
+    char id[OVERLAY_ID_MAX], type_str[16];
+    const char *p;
+
+    if (!args)
+        return AVERROR(EINVAL);
+
+    if (sscanf(args, "%31s %15s", id, type_str) != 2) {
+        av_log(ctx, AV_LOG_ERROR, "overlay args: expected '<id> <type> ...', got: %s\n", args);
+        return AVERROR(EINVAL);
+    }
+
+    memset(ov, 0, sizeof(*ov));
+    av_strlcpy(ov->id, id, sizeof(ov->id));
+
+    if      (!strcmp(type_str, "rect")) ov->type = OV_TYPE_RECT;
+    else if (!strcmp(type_str, "text")) ov->type = OV_TYPE_TEXT;
+    else if (!strcmp(type_str, "icon")) ov->type = OV_TYPE_ICON;
+    else if (!strcmp(type_str, "dim"))  ov->type = OV_TYPE_DIM;
+    else {
+        av_log(ctx, AV_LOG_ERROR, "unknown overlay type: %s\n", type_str);
+        return AVERROR(EINVAL);
+    }
+
+    /* Defaults */
+    ov->cell    = -1;
+    ov->opacity = 255;
+    ov->visible = 1;
+    if (ov->type == OV_TYPE_RECT)
+        ov->u.rect.thickness = 0;  /* filled */
+    if (ov->type == OV_TYPE_DIM)
+        ov->u.dim.amount = 128;
+
+    /* Advance past <id> и <type> */
+    p = strchr(args, ' ');
+    if (!p) return 0;
+    while (*p == ' ') p++;
+    p = strchr(p, ' ');
+    if (!p) return 0;
+    while (*p == ' ') p++;
+
+    while (*p) {
+        char key[32], val[OVERLAY_TEXT_MAX];
+        int n = 0;
+        if (sscanf(p, "%31[^= \t]=%127s%n", key, val, &n) < 2)
+            break;
+        p += n;
+        while (*p == ' ' || *p == '\t') p++;
+
+        if      (!strcmp(key, "cell"))    ov->cell    = atoi(val);
+        else if (!strcmp(key, "x"))       ov->x       = (float)atof(val);
+        else if (!strcmp(key, "y"))       ov->y       = (float)atof(val);
+        else if (!strcmp(key, "w"))       ov->w       = (float)atof(val);
+        else if (!strcmp(key, "h"))       ov->h       = (float)atof(val);
+        else if (!strcmp(key, "z_order") || !strcmp(key, "z"))
+                                          ov->z_order = atoi(val);
+        else if (!strcmp(key, "opacity")) ov->opacity = av_clip(atoi(val), 0, 255);
+        else if (!strcmp(key, "visible")) ov->visible = atoi(val) ? 1 : 0;
+        else if (ov->type == OV_TYPE_RECT) {
+            if      (!strcmp(key, "r"))         ov->u.rect.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.rect.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.rect.b = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val);
+        } else if (ov->type == OV_TYPE_TEXT) {
+            if      (!strcmp(key, "text"))      av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text));
+            else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val);
+            else if (!strcmp(key, "r"))         ov->u.text.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.text.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.text.b = av_clip_uint8(atoi(val));
+        } else if (ov->type == OV_TYPE_ICON) {
+            if (!strcmp(key, "icon_name")) av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name));
+        } else if (ov->type == OV_TYPE_DIM) {
+            if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val));
+        }
+    }
+    return 0;
+}
+
+/* Lock must be held by caller. */
+static int overlay_upsert_locked(CudaGridContext *s, const GridOverlay *ov)
+{
+    int i;
+    for (i = 0; i < s->nb_overlays; i++) {
+        if (!strcmp(s->overlays[i].id, ov->id)) {
+            s->overlays[i] = *ov;
+            return 0;
+        }
+    }
+    if (s->nb_overlays >= MAX_OVERLAYS)
+        return AVERROR(ENOSPC);
+    s->overlays[s->nb_overlays++] = *ov;
+    return 0;
+}
+
+static int overlay_remove_locked(CudaGridContext *s, const char *id)
+{
+    int i;
+    for (i = 0; i < s->nb_overlays; i++) {
+        if (!strcmp(s->overlays[i].id, id)) {
+            memmove(&s->overlays[i], &s->overlays[i + 1],
+                    (s->nb_overlays - i - 1) * sizeof(GridOverlay));
+            s->nb_overlays--;
+            return 0;
+        }
+    }
+    return AVERROR(ENOENT);
+}
+
+/* Compute pixel rect для overlay: либо cell-relative, либо absolute.
+ * Clips против output bounds + 2px alignment для NV12 chroma. */
+static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov,
+                              int *out_x, int *out_y, int *out_w, int *out_h)
+{
+    int rx, ry, rw, rh;
+    int base_x, base_y, base_w, base_h;
+
+    if (ov->cell < 0) {
+        base_x = 0;
+        base_y = 0;
+        base_w = s->out_width;
+        base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells) {
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else {
+        return AVERROR(EINVAL);
+    }
+
+    rx = base_x + (int)(ov->x * base_w);
+    ry = base_y + (int)(ov->y * base_h);
+    rw = (int)(ov->w * base_w);
+    rh = (int)(ov->h * base_h);
+
+    if (rx < 0)             { rw += rx; rx = 0; }
+    if (ry < 0)             { rh += ry; ry = 0; }
+    if (rx + rw > s->out_width)  rw = s->out_width  - rx;
+    if (ry + rh > s->out_height) rh = s->out_height - ry;
+    rx &= ~1; ry &= ~1; rw &= ~1; rh &= ~1;
+
+    *out_x = rx; *out_y = ry; *out_w = rw; *out_h = rh;
+    return (rw > 0 && rh > 0) ? 0 : 1;  /* 1 → empty, skip */
+}
+
+/* Solid filled strip (no alpha). cu ctx must be pushed. */
+static int render_strip_solid(AVFilterContext *ctx, AVFrame *out,
+                              int x, int y, int w, int h,
+                              uint8_t Y, uint8_t U, uint8_t V)
+{
+    CudaGridContext *s = ctx->priv;
+    CUdeviceptr dst_y, dst_uv;
+    unsigned short uv;
+    int ret;
+
+    x &= ~1; y &= ~1; w &= ~1; h &= ~1;
+    if (w <= 0 || h <= 0) return 0;
+
+    dst_y = (CUdeviceptr)(out->data[0] + (size_t)y * out->linesize[0] + x);
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async(
+        dst_y, out->linesize[0], Y, (size_t)w, (size_t)h, s->cu_stream));
+    if (ret < 0) return ret;
+
+    /* NV12 UV interleaved, half res. cuMemsetD2D16Async sets 16-bit element
+     * = UV pair. Little-endian: low byte=U, high byte=V. */
+    uv = (unsigned short)U | ((unsigned short)V << 8);
+    dst_uv = (CUdeviceptr)(out->data[1] + (size_t)(y / 2) * out->linesize[1] + x);
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async(
+        dst_uv, out->linesize[1], uv, (size_t)(w / 2), (size_t)(h / 2), s->cu_stream));
+    return ret;
+}
+
+static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    int px, py, pw, ph;
+    uint8_t Y, U, V;
+    int ret;
+
+    ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph);
+    if (ret != 0) return ret < 0 ? ret : 0;
+
+    rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V);
+
+    if (ov->u.rect.thickness <= 0) {
+        /* Filled */
+        return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V);
+    } else {
+        int t = ov->u.rect.thickness;
+        t = FFMIN(t, FFMIN(pw / 2, ph / 2));
+        if (t < 2) t = 2;
+        t &= ~1;
+        /* Top */
+        ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V);
+        if (ret < 0) return ret;
+        /* Bottom */
+        ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V);
+        if (ret < 0) return ret;
+        /* Left */
+        ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V);
+        if (ret < 0) return ret;
+        /* Right */
+        ret = render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V);
+        return ret;
+    }
+}
+
+/* Render all visible overlays. cu ctx must be pushed by caller. */
+static int render_overlays(AVFilterContext *ctx, AVFrame *out)
+{
+    CudaGridContext *s = ctx->priv;
+    GridOverlay sorted[MAX_OVERLAYS];
+    int i, j, n, ret;
+
+    pthread_mutex_lock(&s->overlay_lock);
+    n = s->nb_overlays;
+    memcpy(sorted, s->overlays, (size_t)n * sizeof(GridOverlay));
+    pthread_mutex_unlock(&s->overlay_lock);
+
+    /* Insertion sort by z_order (stable). n ≤ 64 → fine. */
+    for (i = 1; i < n; i++) {
+        GridOverlay tmp = sorted[i];
+        for (j = i; j > 0 && sorted[j - 1].z_order > tmp.z_order; j--)
+            sorted[j] = sorted[j - 1];
+        sorted[j] = tmp;
+    }
+
+    for (i = 0; i < n; i++) {
+        const GridOverlay *ov = &sorted[i];
+        if (!ov->visible) continue;
+        switch (ov->type) {
+        case OV_TYPE_RECT:
+            ret = render_overlay_rect(ctx, out, ov);
+            if (ret < 0) return ret;
+            break;
+        case OV_TYPE_DIM:
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id);
+            break;
+        case OV_TYPE_TEXT:
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id);
+            break;
+        case OV_TYPE_ICON:
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id);
+            break;
+        }
+    }
+    return 0;
+}
+
 /* ─── Framesync callback ──────────────────────────────────────────────── */
 
 static int cuda_grid_compose(FFFrameSync *fs)
@@ -267,6 +579,13 @@ static int cuda_grid_compose(FFFrameSync *fs)
         }
     }
 
+    /* Overlay pass (Phase 4b) */
+    ret = render_overlays(ctx, out);
+    if (ret < 0) {
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        goto fail;
+    }
+
     CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
     return ff_filter_frame(outlink, out);
 
@@ -305,7 +624,15 @@ static av_cold int cuda_grid_init(AVFilterContext *ctx)
             return ret;
     }
 
-    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n",
+    /* Overlay mutex */
+    ret = pthread_mutex_init(&s->overlay_lock, NULL);
+    if (ret) {
+        av_log(ctx, AV_LOG_ERROR, "overlay_lock init failed: %d\n", ret);
+        return AVERROR(ENOMEM);
+    }
+    s->overlay_lock_inited = 1;
+
+    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d (overlays: rect)\n",
            lt->name, lt->nb_cells, s->out_width, s->out_height);
     return 0;
 }
@@ -314,6 +641,58 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
 {
     CudaGridContext *s = ctx->priv;
     ff_framesync_uninit(&s->fs);
+    if (s->overlay_lock_inited) {
+        pthread_mutex_destroy(&s->overlay_lock);
+        s->overlay_lock_inited = 0;
+    }
+}
+
+static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
+                                     const char *arg, char *res, int res_len, int flags)
+{
+    CudaGridContext *s = ctx->priv;
+    int ret;
+
+    if (!strcmp(cmd, "add_overlay")) {
+        GridOverlay ov;
+        ret = parse_overlay_args(ctx, arg, &ov);
+        if (ret < 0) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return ret;
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        ret = overlay_upsert_locked(s, &ov);
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s n=%d" : "err full id=%s",
+                          ov.id, s->nb_overlays);
+        return ret;
+    }
+
+    if (!strcmp(cmd, "remove_overlay")) {
+        char id[OVERLAY_ID_MAX];
+        if (!arg || sscanf(arg, "%31s", id) != 1) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return AVERROR(EINVAL);
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        ret = overlay_remove_locked(s, id);
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id);
+        return ret;
+    }
+
+    if (!strcmp(cmd, "clear_overlays")) {
+        pthread_mutex_lock(&s->overlay_lock);
+        s->nb_overlays = 0;
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) av_strlcpy(res, "ok", res_len);
+        return 0;
+    }
+
+    /* Future: set_layout (Phase 3 на filter side — пока nb_inputs хардкодится в init) */
+
+    /* Fall back to standard option/command handling */
+    return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags);
 }
 
 static int cuda_grid_config_input(AVFilterLink *inlink)
@@ -465,16 +844,17 @@ static const AVFilterPad cuda_grid_outputs[] = {
 };
 
 const AVFilter ff_vf_cuda_grid = {
-    .name           = "cuda_grid",
-    .description    = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
-    .priv_class     = &cuda_grid_class,
-    .priv_size      = sizeof(CudaGridContext),
-    .init           = cuda_grid_init,
-    .uninit         = cuda_grid_uninit,
-    .activate       = cuda_grid_activate,
+    .name            = "cuda_grid",
+    .description     = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
+    .priv_class      = &cuda_grid_class,
+    .priv_size       = sizeof(CudaGridContext),
+    .init            = cuda_grid_init,
+    .uninit          = cuda_grid_uninit,
+    .activate        = cuda_grid_activate,
+    .process_command = cuda_grid_process_command,
     /* No FILTER_INPUTS — pads added dynamically в init() per layout. */
     FILTER_OUTPUTS(cuda_grid_outputs),
     FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
-    .flags          = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
-    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+    .flags           = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
 };
-- 
2.52.0


From 8ca590004b48ba11b5c80900374031d0bf625246 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 22:24:01 +0100
Subject: [PATCH 08/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-2=20?=
 =?UTF-8?q?=E2=80=94=20CUDA=20kernels=20(alpha-blended=20overlays)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Custom .cu kernels компилируются через ffbuild → .ptx, embedded в binary
через bin2c pattern (как vf_scale_cuda). Loading через ff_cuda_load_module
в config_output, kernel handles cached на life-of-filter.

Kernels:
  Alpha_Fill_Y/UV          — solid colour α-blend (rect, dim, border strips)
  Alpha_Blit_RGBA_Y/UV     — blit RGBA atlas → NV12 (text/icon в 4b-3/4)

Render side:
  render_strip_alpha       — заменил render_strip_solid (alpha=255 ≡ solid)
  render_overlay_rect      — opacity передаётся в kernel
  render_overlay_dim       — Y=16/UV=neutral × dim.amount (затемнение region)

Также: убран unused cuda_grid_config_input (-Werror), добавлены fields
CUmodule + 4×CUfunction в Context; unload в uninit.

cuMemsetD2D* НЕ доступны в FFmpeg's CudaFunctions wrapper, поэтому solid
fill реализован через kernel — лишний overhead minimal (16×16 threads).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 libavfilter/Makefile        |   3 +-
 libavfilter/vf_cuda_grid.c  | 143 +++++++++++++++++++++++++++---------
 libavfilter/vf_cuda_grid.cu | 121 ++++++++++++++++++++++++++++++
 3 files changed, 230 insertions(+), 37 deletions(-)
 create mode 100644 libavfilter/vf_cuda_grid.cu

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 8015a49..f570821 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -410,7 +410,8 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
                                                 cuda/load_helper.o
-OBJS-$(CONFIG_CUDA_GRID_FILTER)              += vf_cuda_grid.o framesync.o
+OBJS-$(CONFIG_CUDA_GRID_FILTER)              += vf_cuda_grid.o framesync.o \
+                                                vf_cuda_grid.ptx.o cuda/load_helper.o
 OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
                                                 opencl/overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_QSV_FILTER)            += vf_overlay_qsv.o framesync.o
diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 9f5cc51..675fb15 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -43,11 +43,19 @@
 #include "libavutil/thread.h"
 
 #include "avfilter.h"
+#include "cuda/load_helper.h"
 #include "filters.h"
 #include "formats.h"
 #include "framesync.h"
 #include "video.h"
 
+extern const unsigned char ff_vf_cuda_grid_ptx_data[];
+extern const unsigned int  ff_vf_cuda_grid_ptx_len;
+
+#define BLOCKX 16
+#define BLOCKY 16
+#define DIV_UP(a, b) (((a) + (b) - 1) / (b))
+
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
 #define MAX_CELLS    16
 #define MAX_OVERLAYS 64
@@ -203,6 +211,13 @@ typedef struct CudaGridContext {
     int             nb_overlays;
     pthread_mutex_t overlay_lock;
     int             overlay_lock_inited;
+
+    /* CUDA kernels (Phase 4b-2) — loaded from embedded .ptx */
+    CUmodule    cu_module;
+    CUfunction  cu_func_alpha_fill_y;
+    CUfunction  cu_func_alpha_fill_uv;
+    CUfunction  cu_func_alpha_blit_rgba_y;
+    CUfunction  cu_func_alpha_blit_rgba_uv;
 } CudaGridContext;
 
 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -396,31 +411,49 @@ static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov,
     return (rw > 0 && rh > 0) ? 0 : 1;  /* 1 → empty, skip */
 }
 
-/* Solid filled strip (no alpha). cu ctx must be pushed. */
-static int render_strip_solid(AVFilterContext *ctx, AVFrame *out,
+/* Strip α-blend (opacity 0..255). cu ctx must be pushed.
+ * opacity=255 → solid fill (kernel uses single-pass blend formula). */
+static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out,
                               int x, int y, int w, int h,
-                              uint8_t Y, uint8_t U, uint8_t V)
+                              uint8_t Y, uint8_t U, uint8_t V, uint8_t alpha)
 {
     CudaGridContext *s = ctx->priv;
     CUdeviceptr dst_y, dst_uv;
-    unsigned short uv;
     int ret;
+    int iY = Y, iU = U, iV = V, iA = alpha;
+    int rx, ry, rw, rh;
 
     x &= ~1; y &= ~1; w &= ~1; h &= ~1;
-    if (w <= 0 || h <= 0) return 0;
+    if (w <= 0 || h <= 0 || alpha == 0) return 0;
 
-    dst_y = (CUdeviceptr)(out->data[0] + (size_t)y * out->linesize[0] + x);
-    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async(
-        dst_y, out->linesize[0], Y, (size_t)w, (size_t)h, s->cu_stream));
-    if (ret < 0) return ret;
+    /* Y plane */
+    dst_y = (CUdeviceptr)out->data[0];
+    rx = x; ry = y; rw = w; rh = h;
+    {
+        int dst_pitch_y = out->linesize[0];
+        void *args[] = { &dst_y, &dst_pitch_y, &rx, &ry, &rw, &rh, &iY, &iA };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_fill_y,
+            DIV_UP(w, BLOCKX), DIV_UP(h, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
 
-    /* NV12 UV interleaved, half res. cuMemsetD2D16Async sets 16-bit element
-     * = UV pair. Little-endian: low byte=U, high byte=V. */
-    uv = (unsigned short)U | ((unsigned short)V << 8);
-    dst_uv = (CUdeviceptr)(out->data[1] + (size_t)(y / 2) * out->linesize[1] + x);
-    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async(
-        dst_uv, out->linesize[1], uv, (size_t)(w / 2), (size_t)(h / 2), s->cu_stream));
-    return ret;
+    /* UV plane (half res, interleaved 2 bytes per chroma sample) */
+    dst_uv = (CUdeviceptr)out->data[1];
+    rx = x / 2; ry = y / 2; rw = w / 2; rh = h / 2;
+    {
+        int dst_pitch_uv = out->linesize[1];
+        void *args[] = { &dst_uv, &dst_pitch_uv, &rx, &ry, &rw, &rh, &iU, &iV, &iA };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_fill_uv,
+            DIV_UP(w / 2, BLOCKX), DIV_UP(h / 2, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    return 0;
 }
 
 static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
@@ -436,28 +469,42 @@ static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOve
     rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V);
 
     if (ov->u.rect.thickness <= 0) {
-        /* Filled */
-        return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V);
+        /* Filled rect */
+        return render_strip_alpha(ctx, out, px, py, pw, ph, Y, U, V, ov->opacity);
     } else {
         int t = ov->u.rect.thickness;
         t = FFMIN(t, FFMIN(pw / 2, ph / 2));
         if (t < 2) t = 2;
         t &= ~1;
         /* Top */
-        ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V);
+        ret = render_strip_alpha(ctx, out, px, py, pw, t, Y, U, V, ov->opacity);
         if (ret < 0) return ret;
         /* Bottom */
-        ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V);
+        ret = render_strip_alpha(ctx, out, px, py + ph - t, pw, t, Y, U, V, ov->opacity);
         if (ret < 0) return ret;
         /* Left */
-        ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V);
+        ret = render_strip_alpha(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V, ov->opacity);
         if (ret < 0) return ret;
         /* Right */
-        ret = render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V);
+        ret = render_strip_alpha(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V, ov->opacity);
         return ret;
     }
 }
 
+/* Dim overlay: рамка тёмная (Y=16, neutral UV) полупрозрачная по cell. */
+static int render_overlay_dim(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    int px, py, pw, ph;
+    int ret;
+
+    ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph);
+    if (ret != 0) return ret < 0 ? ret : 0;
+
+    /* Y=16 = darkest legal Y (BT.709 limited range), UV=128 = neutral chroma */
+    return render_strip_alpha(ctx, out, px, py, pw, ph, 16, 128, 128, ov->u.dim.amount);
+}
+
 /* Render all visible overlays. cu ctx must be pushed by caller. */
 static int render_overlays(AVFilterContext *ctx, AVFrame *out)
 {
@@ -487,7 +534,8 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out)
             if (ret < 0) return ret;
             break;
         case OV_TYPE_DIM:
-            av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id);
+            ret = render_overlay_dim(ctx, out, ov);
+            if (ret < 0) return ret;
             break;
         case OV_TYPE_TEXT:
             av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id);
@@ -641,6 +689,13 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
 {
     CudaGridContext *s = ctx->priv;
     ff_framesync_uninit(&s->fs);
+    if (s->cu_module && s->hwctx) {
+        CUcontext dummy;
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        s->cu_module = NULL;
+    }
     if (s->overlay_lock_inited) {
         pthread_mutex_destroy(&s->overlay_lock);
         s->overlay_lock_inited = 0;
@@ -695,19 +750,6 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
     return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags);
 }
 
-static int cuda_grid_config_input(AVFilterLink *inlink)
-{
-    AVFilterContext *ctx = inlink->src;
-    FilterLink      *inl = ff_filter_link(inlink);
-
-    if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
-        av_log(ctx, AV_LOG_ERROR, "input %s: software pixel format не поддерживается\n",
-               inlink->dstpad->name);
-        return AVERROR(EINVAL);
-    }
-    return 0;
-}
-
 static int cuda_grid_config_output(AVFilterLink *outlink)
 {
     AVFilterContext     *ctx  = outlink->src;
@@ -773,6 +815,35 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
     s->cu_ctx   = s->hwctx->cuda_ctx;
     s->cu_stream = s->hwctx->stream;
 
+    /* Load CUDA module + resolve kernel handles (Phase 4b-2 alpha kernels) */
+    {
+        CUcontext dummy;
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+        if (ret < 0) return ret;
+
+        ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                                  ff_vf_cuda_grid_ptx_data, ff_vf_cuda_grid_ptx_len);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            return ret;
+        }
+
+#define GET_FN(handle, name)                                                                \
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleGetFunction(                    \
+                       &s->handle, s->cu_module, name));                                    \
+        if (ret < 0) {                                                                       \
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));                 \
+            return ret;                                                                      \
+        }
+        GET_FN(cu_func_alpha_fill_y,        "Alpha_Fill_Y");
+        GET_FN(cu_func_alpha_fill_uv,       "Alpha_Fill_UV");
+        GET_FN(cu_func_alpha_blit_rgba_y,   "Alpha_Blit_RGBA_Y");
+        GET_FN(cu_func_alpha_blit_rgba_uv,  "Alpha_Blit_RGBA_UV");
+#undef GET_FN
+
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+    }
+
     out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
     if (!out_ref)
         return AVERROR(ENOMEM);
diff --git a/libavfilter/vf_cuda_grid.cu b/libavfilter/vf_cuda_grid.cu
new file mode 100644
index 0000000..10d2af6
--- /dev/null
+++ b/libavfilter/vf_cuda_grid.cu
@@ -0,0 +1,121 @@
+/*
+ * cuda_grid overlay CUDA kernels.
+ *
+ * Алфа-блендинг poверх NV12 frame:
+ *   - Alpha_Fill_Y / Alpha_Fill_UV: solid colour fill region (rect/dim)
+ *   - Alpha_Blit_RGBA_Y / Alpha_Blit_RGBA_UV: blit RGBA atlas → NV12
+ *     (для text Phase 4b-3 и icon Phase 4b-4)
+ *
+ * BT.709 limited-range conversion (HDTV). См. также vf_cuda_grid.c rgb_to_yuv709.
+ *
+ * Лицензия: LGPL-2.1+ (соответствует FFmpeg)
+ */
+
+#include "cuda/vector_helpers.cuh"
+
+extern "C" {
+
+/* Solid colour α-blend на Y plane.
+ *   dst: pointer на Y plane base
+ *   dst_pitch: bytes per row
+ *   rx, ry, rw, rh: region rect (pixels, must be in-bounds)
+ *   fill: 0..255 (Y component to blend in)
+ *   alpha: 0..255 (255 = fully opaque)
+ */
+__global__ void Alpha_Fill_Y(unsigned char *dst, int dst_pitch,
+                             int rx, int ry, int rw, int rh,
+                             int fill, int alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= rw || y >= rh) return;
+    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x);
+    int cur = *p;
+    *p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255);
+}
+
+/* Solid colour α-blend на UV plane (NV12 interleaved).
+ *   rx, ry: in chroma plane coords (= full-res Y x/2, y/2)
+ *   rw, rh: also в chroma coords
+ */
+__global__ void Alpha_Fill_UV(unsigned char *dst, int dst_pitch,
+                              int rx, int ry, int rw, int rh,
+                              int fill_u, int fill_v, int alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= rw || y >= rh) return;
+    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2;
+    int cu = p[0], cv = p[1];
+    p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255);
+    p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255);
+}
+
+/* Blit RGBA atlas → Y plane с α-blending.
+ *   dst, dst_pitch: Y plane
+ *   dx, dy: destination pixel offset (full-res coords)
+ *   atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch в bytes
+ *   w, h: atlas dimensions (in pixels)
+ *   extra_alpha: дополнительный множитель (0..255) — overlay-level opacity
+ */
+__global__ void Alpha_Blit_RGBA_Y(unsigned char *dst, int dst_pitch,
+                                  int dx, int dy,
+                                  const unsigned char *atlas, int atlas_pitch,
+                                  int w, int h, int extra_alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= w || y >= h) return;
+    const unsigned char *sp = atlas + y * atlas_pitch + x * 4;
+    int r = sp[0], g = sp[1], b = sp[2], a = sp[3];
+    a = a * extra_alpha / 255;
+    if (a == 0) return;
+    int Y = (int)(0.183f * r + 0.614f * g + 0.062f * b) + 16;
+    Y = Y < 0 ? 0 : (Y > 255 ? 255 : Y);
+    unsigned char *p = dst + (dy + y) * dst_pitch + (dx + x);
+    int cur = *p;
+    *p = (unsigned char)((Y * a + cur * (255 - a)) / 255);
+}
+
+/* Blit RGBA atlas → UV plane с 4:2:0 chroma subsampling.
+ *   dst, dst_pitch: UV plane
+ *   dx, dy: destination pixel offset (full-res coords; обе должны быть кратны 2)
+ *   atlas, atlas_pitch: RGBA source
+ *   w, h: atlas dimensions (full-res; UV operates на w/2 × h/2)
+ *   extra_alpha: 0..255
+ */
+__global__ void Alpha_Blit_RGBA_UV(unsigned char *dst, int dst_pitch,
+                                   int dx, int dy,
+                                   const unsigned char *atlas, int atlas_pitch,
+                                   int w, int h, int extra_alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int hw = w / 2, hh = h / 2;
+    if (x >= hw || y >= hh) return;
+
+    int sx = x * 2, sy = y * 2;
+    const unsigned char *row0 = atlas + sy * atlas_pitch;
+    const unsigned char *row1 = atlas + (sy + 1) * atlas_pitch;
+
+    int r = (row0[sx*4+0] + row0[(sx+1)*4+0] + row1[sx*4+0] + row1[(sx+1)*4+0]) >> 2;
+    int g = (row0[sx*4+1] + row0[(sx+1)*4+1] + row1[sx*4+1] + row1[(sx+1)*4+1]) >> 2;
+    int b = (row0[sx*4+2] + row0[(sx+1)*4+2] + row1[sx*4+2] + row1[(sx+1)*4+2]) >> 2;
+    int a = (row0[sx*4+3] + row0[(sx+1)*4+3] + row1[sx*4+3] + row1[(sx+1)*4+3]) >> 2;
+    a = a * extra_alpha / 255;
+    if (a == 0) return;
+
+    int U = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128;
+    int V = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128;
+    U = U < 0 ? 0 : (U > 255 ? 255 : U);
+    V = V < 0 ? 0 : (V > 255 ? 255 : V);
+
+    int du = dx / 2 + x;
+    int dv = dy / 2 + y;
+    unsigned char *p = dst + dv * dst_pitch + du * 2;
+    int cu = p[0], cv = p[1];
+    p[0] = (unsigned char)((U * a + cu * (255 - a)) / 255);
+    p[1] = (unsigned char)((V * a + cv * (255 - a)) / 255);
+}
+
+}  /* extern "C" */
-- 
2.52.0


From 1e54f04e247143d8bf5312dc17292e6f591909e7 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 22:26:12 +0100
Subject: [PATCH 09/14] =?UTF-8?q?configure:=20add=20cuda=5Fgrid=5Ffilter?=
 =?UTF-8?q?=5Fdeps=5Fany=20=D0=B4=D0=BB=D1=8F=20cuda=5Fnvcc/cuda=5Fllvm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Без этого filter compileд но .ptx step failед: configure не knows что
filter требует nvcc/clang для .cu compile. Same pattern как scale_cuda.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 configure | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configure b/configure
index 9c60cb7..b94029e 100755
--- a/configure
+++ b/configure
@@ -3318,6 +3318,7 @@ transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 cuda_grid_filter_deps="ffnvcodec"
+cuda_grid_filter_deps_any="cuda_nvcc cuda_llvm"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
-- 
2.52.0


From 40104613009c0482fa9d9bfd4910245053a6ec68 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 22:30:36 +0100
Subject: [PATCH 10/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-3=20?=
 =?UTF-8?q?=E2=80=94=20text=20overlay=20(freetype=20+=20RGBA=20atlas)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CPU rasterization через freetype (DejaVu/Liberation auto-detect или
font_file= option), upload pitched RGBA buffer to GPU, blit через
Alpha_Blit_RGBA_Y/UV kernels (Phase 4b-2 уже had).

Cache:
  per-overlay-id atlas с keys (text, font_size, r/g/b) — re-rasterize
  только при change. Cleanup при remove_overlay/clear_overlays/uninit.

Options:
  font_file=  — TTF path (default: search DejaVu/Liberation)
  font_size=  — default size if overlay не указал свой

Wire format: add_overlay <id> text x=.. y=.. text=hello font_size=24 r=255 g=255 b=255 opacity=200

Conditional на CONFIG_LIBFREETYPE — без него text overlays no-op
(остальные типы работают как обычно).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 libavfilter/vf_cuda_grid.c | 417 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 413 insertions(+), 4 deletions(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 675fb15..11735e1 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -31,6 +31,8 @@
 
 #include "config_components.h"
 
+#include <unistd.h>
+
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/cuda_check.h"
@@ -42,6 +44,12 @@
 #include "libavutil/pixdesc.h"
 #include "libavutil/thread.h"
 
+#include "config.h"
+#if CONFIG_LIBFREETYPE
+#include <ft2build.h>
+#include FT_FREETYPE_H
+#endif
+
 #include "avfilter.h"
 #include "cuda/load_helper.h"
 #include "filters.h"
@@ -183,6 +191,17 @@ static const LayoutTemplate *find_layout(const char *name)
 
 /* ─── Filter state ─────────────────────────────────────────────────────── */
 
+/* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */
+typedef struct TextAtlas {
+    char        id[OVERLAY_ID_MAX];
+    char        cached_text[OVERLAY_TEXT_MAX];
+    int         cached_font_size;
+    uint8_t     cached_r, cached_g, cached_b;
+    CUdeviceptr device_ptr;
+    size_t      device_pitch;
+    int         w, h;
+} TextAtlas;
+
 typedef struct CudaGridContext {
     const AVClass *class;
 
@@ -190,6 +209,8 @@ typedef struct CudaGridContext {
     char *layout_name;
     int   out_width;
     int   out_height;
+    char *font_file;
+    int   default_font_size;
 
     /* Resolved layout (после init) */
     const LayoutTemplate *layout;
@@ -218,6 +239,16 @@ typedef struct CudaGridContext {
     CUfunction  cu_func_alpha_fill_uv;
     CUfunction  cu_func_alpha_blit_rgba_y;
     CUfunction  cu_func_alpha_blit_rgba_uv;
+
+    /* Text rendering (Phase 4b-3, libfreetype optional) */
+#if CONFIG_LIBFREETYPE
+    FT_Library  ft_lib;
+    FT_Face     ft_face;
+    int         ft_ready;          /* 1 once library + face loaded */
+    int         ft_init_attempted; /* set on first attempt, success or fail */
+#endif
+    TextAtlas   atlases[MAX_OVERLAYS];
+    int         nb_atlases;
 } CudaGridContext;
 
 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -456,6 +487,344 @@ static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out,
     return 0;
 }
 
+/* ─── Text rendering (Phase 4b-3, freetype) ────────────────────────────── */
+
+#if CONFIG_LIBFREETYPE
+
+static const char *const default_fonts[] = {
+    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
+    "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
+    "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf",
+    "/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf",
+    NULL,
+};
+
+static int ensure_ft_loaded(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    const char *path = s->font_file;
+    FT_Error err;
+    int i;
+
+    if (s->ft_ready) return 0;
+    if (s->ft_init_attempted) return AVERROR(ENOSYS);
+    s->ft_init_attempted = 1;
+
+    err = FT_Init_FreeType(&s->ft_lib);
+    if (err) {
+        av_log(ctx, AV_LOG_WARNING, "FT_Init_FreeType failed (err=%d), text overlays disabled\n", err);
+        return AVERROR(ENOSYS);
+    }
+
+    if (!path) {
+        for (i = 0; default_fonts[i]; i++) {
+            if (access(default_fonts[i], R_OK) == 0) {
+                path = default_fonts[i];
+                break;
+            }
+        }
+    }
+    if (!path) {
+        av_log(ctx, AV_LOG_WARNING, "no font found (set font_file= option), text disabled\n");
+        FT_Done_FreeType(s->ft_lib);
+        s->ft_lib = NULL;
+        return AVERROR(ENOENT);
+    }
+
+    err = FT_New_Face(s->ft_lib, path, 0, &s->ft_face);
+    if (err) {
+        av_log(ctx, AV_LOG_WARNING, "FT_New_Face(%s) failed: %d, text disabled\n", path, err);
+        FT_Done_FreeType(s->ft_lib);
+        s->ft_lib = NULL;
+        return AVERROR(EIO);
+    }
+
+    av_log(ctx, AV_LOG_INFO, "freetype: loaded %s\n", path);
+    s->ft_ready = 1;
+    return 0;
+}
+
+/* Two-pass rasterize: 1) measure total_w / ascent / descent, 2) blit to RGBA buffer.
+ * Returns malloc'd RGBA buffer (caller frees) + dims via out params, or NULL. */
+static uint8_t *rasterize_text_rgba(AVFilterContext *ctx, const char *text, int font_size,
+                                    uint8_t r, uint8_t g, uint8_t b,
+                                    int *out_w, int *out_h)
+{
+    CudaGridContext *s = ctx->priv;
+    FT_Error err;
+    const unsigned char *p;
+    int total_w = 0, ascent_px = 0, descent_px = 0;
+    int atlas_w, atlas_h, pen_x, baseline_y;
+    uint8_t *atlas;
+
+    if (!s->ft_ready) return NULL;
+    err = FT_Set_Pixel_Sizes(s->ft_face, 0, font_size);
+    if (err) {
+        av_log(ctx, AV_LOG_WARNING, "FT_Set_Pixel_Sizes failed: %d\n", err);
+        return NULL;
+    }
+
+    /* Pass 1: measure */
+    for (p = (const unsigned char *)text; *p; p++) {
+        FT_UInt gi;
+        FT_GlyphSlot g_slot;
+        gi = FT_Get_Char_Index(s->ft_face, *p);
+        if (!gi) continue;
+        if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_DEFAULT)) continue;
+        g_slot = s->ft_face->glyph;
+        total_w  += (g_slot->advance.x >> 6);
+        ascent_px  = FFMAX(ascent_px,  g_slot->bitmap_top);
+        descent_px = FFMAX(descent_px, (int)g_slot->bitmap.rows - g_slot->bitmap_top);
+    }
+    if (total_w <= 0) return NULL;
+
+    atlas_w = total_w + 4;  /* padding */
+    atlas_h = ascent_px + descent_px + 4;
+    if (atlas_h <= 0) atlas_h = font_size;
+
+    atlas = av_mallocz((size_t)atlas_w * atlas_h * 4);
+    if (!atlas) return NULL;
+
+    /* Pass 2: blit */
+    pen_x      = 2;
+    baseline_y = ascent_px + 2;
+    for (p = (const unsigned char *)text; *p; p++) {
+        FT_UInt gi;
+        FT_GlyphSlot g_slot;
+        int gx, gy, bw, bh;
+        int yy, xx;
+
+        gi = FT_Get_Char_Index(s->ft_face, *p);
+        if (!gi) continue;
+        if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_RENDER)) continue;
+        g_slot = s->ft_face->glyph;
+        if (g_slot->bitmap.pixel_mode != FT_PIXEL_MODE_GRAY) {
+            pen_x += g_slot->advance.x >> 6;
+            continue;
+        }
+        bw = g_slot->bitmap.width;
+        bh = g_slot->bitmap.rows;
+        gx = pen_x + g_slot->bitmap_left;
+        gy = baseline_y - g_slot->bitmap_top;
+
+        for (yy = 0; yy < bh; yy++) {
+            int dy = gy + yy;
+            if (dy < 0 || dy >= atlas_h) continue;
+            for (xx = 0; xx < bw; xx++) {
+                int dx = gx + xx;
+                uint8_t gray;
+                uint8_t *dst;
+                if (dx < 0 || dx >= atlas_w) continue;
+                gray = g_slot->bitmap.buffer[yy * g_slot->bitmap.pitch + xx];
+                if (gray == 0) continue;
+                dst = atlas + ((size_t)dy * atlas_w + dx) * 4;
+                /* Premultiplied: max() — text "fills" rather than blending overlaps */
+                if (gray > dst[3]) {
+                    dst[0] = r;
+                    dst[1] = g;
+                    dst[2] = b;
+                    dst[3] = gray;
+                }
+            }
+        }
+        pen_x += g_slot->advance.x >> 6;
+    }
+
+    *out_w = atlas_w;
+    *out_h = atlas_h;
+    return atlas;
+}
+
+#endif  /* CONFIG_LIBFREETYPE */
+
+/* Cache lookup — locked by caller. */
+static TextAtlas *atlas_find_locked(CudaGridContext *s, const char *id)
+{
+    int i;
+    for (i = 0; i < s->nb_atlases; i++)
+        if (!strcmp(s->atlases[i].id, id))
+            return &s->atlases[i];
+    return NULL;
+}
+
+static void atlas_free_gpu_locked(CudaGridContext *s, TextAtlas *a)
+{
+    if (a->device_ptr) {
+        s->hwctx->internal->cuda_dl->cuMemFree(a->device_ptr);
+        a->device_ptr = 0;
+    }
+}
+
+static void atlas_remove_locked(CudaGridContext *s, const char *id)
+{
+    int i;
+    for (i = 0; i < s->nb_atlases; i++) {
+        if (!strcmp(s->atlases[i].id, id)) {
+            atlas_free_gpu_locked(s, &s->atlases[i]);
+            memmove(&s->atlases[i], &s->atlases[i + 1],
+                    (s->nb_atlases - i - 1) * sizeof(TextAtlas));
+            s->nb_atlases--;
+            return;
+        }
+    }
+}
+
+#if CONFIG_LIBFREETYPE
+/* Ensure GPU atlas for given overlay matches its text/font_size/color. cu ctx pushed by caller. */
+static int ensure_text_atlas(AVFilterContext *ctx, const GridOverlay *ov, TextAtlas **out_atlas)
+{
+    CudaGridContext *s = ctx->priv;
+    TextAtlas *a;
+    uint8_t *cpu_atlas;
+    int aw, ah;
+    int font_size;
+    CUdeviceptr dev_ptr = 0;
+    size_t dev_pitch = 0;
+    CUDA_MEMCPY2D cpy = { 0 };
+    int ret;
+
+    font_size = ov->u.text.font_size > 0 ? ov->u.text.font_size : s->default_font_size;
+
+    a = atlas_find_locked(s, ov->id);
+    if (a &&
+        !strcmp(a->cached_text, ov->u.text.text) &&
+        a->cached_font_size == font_size &&
+        a->cached_r == ov->u.text.r &&
+        a->cached_g == ov->u.text.g &&
+        a->cached_b == ov->u.text.b) {
+        *out_atlas = a;
+        return 0;
+    }
+
+    if (s->nb_atlases >= MAX_OVERLAYS && !a)
+        return AVERROR(ENOSPC);
+
+    ret = ensure_ft_loaded(ctx);
+    if (ret < 0) return ret;
+
+    cpu_atlas = rasterize_text_rgba(ctx, ov->u.text.text, font_size,
+                                    ov->u.text.r, ov->u.text.g, ov->u.text.b,
+                                    &aw, &ah);
+    if (!cpu_atlas) return AVERROR(EINVAL);
+
+    /* Allocate pitched device buffer + upload */
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch(
+        &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4));
+    if (ret < 0) { av_free(cpu_atlas); return ret; }
+
+    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+    cpy.srcHost       = cpu_atlas;
+    cpy.srcPitch      = (size_t)aw * 4;
+    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    cpy.dstDevice     = dev_ptr;
+    cpy.dstPitch      = dev_pitch;
+    cpy.WidthInBytes  = (size_t)aw * 4;
+    cpy.Height        = (size_t)ah;
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy));
+    av_free(cpu_atlas);
+    if (ret < 0) {
+        s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr);
+        return ret;
+    }
+
+    if (a) {
+        atlas_free_gpu_locked(s, a);
+    } else {
+        a = &s->atlases[s->nb_atlases++];
+        memset(a, 0, sizeof(*a));
+        av_strlcpy(a->id, ov->id, sizeof(a->id));
+    }
+    av_strlcpy(a->cached_text, ov->u.text.text, sizeof(a->cached_text));
+    a->cached_font_size = font_size;
+    a->cached_r = ov->u.text.r;
+    a->cached_g = ov->u.text.g;
+    a->cached_b = ov->u.text.b;
+    a->device_ptr   = dev_ptr;
+    a->device_pitch = dev_pitch;
+    a->w = aw;
+    a->h = ah;
+    *out_atlas = a;
+    return 0;
+}
+
+static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    TextAtlas *a = NULL;
+    int px, py;
+    int base_x, base_y, base_w, base_h;
+    CUdeviceptr dst_y, dst_uv;
+    int ret;
+    int aw, ah, ap, ealpha;
+
+    if (!ov->u.text.text[0]) return 0;
+    ret = ensure_text_atlas(ctx, ov, &a);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_WARNING, "text overlay %s: atlas failed (ret=%d)\n", ov->id, ret);
+        return 0; /* non-fatal — skip */
+    }
+    if (!a) return 0;
+
+    /* Position: x,y normalized within cell or output (top-left corner of atlas). */
+    if (ov->cell < 0) {
+        base_x = 0; base_y = 0;
+        base_w = s->out_width; base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells) {
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else {
+        return 0;
+    }
+    px = base_x + (int)(ov->x * base_w);
+    py = base_y + (int)(ov->y * base_h);
+    px &= ~1; py &= ~1;
+    /* Clip — if atlas runs past output bounds, kernels still safe (they bounds-check
+     * against atlas w/h, not dst). But avoid going off output: simple skip when fully outside. */
+    if (px >= s->out_width || py >= s->out_height) return 0;
+
+    aw = a->w; ah = a->h;
+    /* Reduce ah if it overflows output (avoid OOB on Y plane); UV plane uses h/2 so same scale. */
+    if (px + aw > s->out_width)  aw = s->out_width  - px;
+    if (py + ah > s->out_height) ah = s->out_height - py;
+    aw &= ~1; ah &= ~1;
+    if (aw <= 0 || ah <= 0) return 0;
+
+    ap = (int)a->device_pitch;
+    ealpha = ov->opacity;
+
+    /* Y plane */
+    dst_y = (CUdeviceptr)out->data[0];
+    {
+        int dst_pitch_y = out->linesize[0];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_y,
+            DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    /* UV plane (kernel does internal half-res subsampling) */
+    dst_uv = (CUdeviceptr)out->data[1];
+    {
+        int dst_pitch_uv = out->linesize[1];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_uv,
+            DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    return 0;
+}
+
+#endif  /* CONFIG_LIBFREETYPE */
+
 static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
 {
     CudaGridContext *s = ctx->priv;
@@ -538,7 +907,12 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out)
             if (ret < 0) return ret;
             break;
         case OV_TYPE_TEXT:
-            av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id);
+#if CONFIG_LIBFREETYPE
+            ret = render_overlay_text(ctx, out, ov);
+            if (ret < 0) return ret;
+#else
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: text — libfreetype disabled at build\n", ov->id);
+#endif
             break;
         case OV_TYPE_ICON:
             av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id);
@@ -688,14 +1062,29 @@ static av_cold int cuda_grid_init(AVFilterContext *ctx)
 static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
 {
     CudaGridContext *s = ctx->priv;
+    int i;
     ff_framesync_uninit(&s->fs);
-    if (s->cu_module && s->hwctx) {
+
+    if (s->hwctx) {
         CUcontext dummy;
         CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
-        CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
+        for (i = 0; i < s->nb_atlases; i++) {
+            if (s->atlases[i].device_ptr) {
+                s->hwctx->internal->cuda_dl->cuMemFree(s->atlases[i].device_ptr);
+                s->atlases[i].device_ptr = 0;
+            }
+        }
+        s->nb_atlases = 0;
+        if (s->cu_module) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
+            s->cu_module = NULL;
+        }
         CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-        s->cu_module = NULL;
     }
+#if CONFIG_LIBFREETYPE
+    if (s->ft_face) { FT_Done_Face(s->ft_face); s->ft_face = NULL; }
+    if (s->ft_lib)  { FT_Done_FreeType(s->ft_lib); s->ft_lib = NULL; }
+#endif
     if (s->overlay_lock_inited) {
         pthread_mutex_destroy(&s->overlay_lock);
         s->overlay_lock_inited = 0;
@@ -731,14 +1120,30 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
         }
         pthread_mutex_lock(&s->overlay_lock);
         ret = overlay_remove_locked(s, id);
+        /* Best-effort atlas GPU cleanup: cu ctx may not be pushed here, push it. */
+        if (ret == 0 && s->hwctx) {
+            CUcontext dummy;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+            atlas_remove_locked(s, id);
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        }
         pthread_mutex_unlock(&s->overlay_lock);
         if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id);
         return ret;
     }
 
     if (!strcmp(cmd, "clear_overlays")) {
+        int i;
         pthread_mutex_lock(&s->overlay_lock);
         s->nb_overlays = 0;
+        if (s->hwctx) {
+            CUcontext dummy;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+            for (i = 0; i < s->nb_atlases; i++)
+                atlas_free_gpu_locked(s, &s->atlases[i]);
+            s->nb_atlases = 0;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        }
         pthread_mutex_unlock(&s->overlay_lock);
         if (res) av_strlcpy(res, "ok", res_len);
         return 0;
@@ -899,6 +1304,10 @@ static const AVOption cuda_grid_options[] = {
       OFFSET(out_width),   AV_OPT_TYPE_INT,    { .i64 = 1920 }, 16, 16384, FLAGS },
     { "out_h", "высота output frame в пикселях",
       OFFSET(out_height),  AV_OPT_TYPE_INT,    { .i64 = 1080 }, 16, 16384, FLAGS },
+    { "font_file", "TTF/OTF font path (default: search DejaVu/Liberation)",
+      OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
+    { "font_size", "default text overlay size (px), используется если overlay не указал свой",
+      OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS },
     { NULL }
 };
 
-- 
2.52.0


From b88f966f835803d79fee14c783ce7d7024fb600a Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 22:32:38 +0100
Subject: [PATCH 11/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20URL-decode=20=D0=B4?=
 =?UTF-8?q?=D0=BB=D1=8F=20string=20overlay=20fields=20(text,=20icon=5Fname?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Controller URL-encode'ит string values (text="hello world" → text=hello%20world)
чтобы пройти sscanf("%s") который stops on whitespace.
Filter inline decode'ит '%xx' obratno в bytes. Только для text + icon_name.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 libavfilter/vf_cuda_grid.c | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 11735e1..1a45eb1 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -294,7 +294,33 @@ static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b,
     *V = av_clip_uint8(v);
 }
 
+/* In-place URL-decode (%20 etc.) — for string values приходящих от controller. */
+static void url_decode_inplace(char *s)
+{
+    char *r = s, *w = s;
+    while (*r) {
+        if (r[0] == '%' && r[1] && r[2]) {
+            int hi = -1, lo = -1;
+            char c1 = r[1], c2 = r[2];
+            if (c1 >= '0' && c1 <= '9') hi = c1 - '0';
+            else if (c1 >= 'A' && c1 <= 'F') hi = c1 - 'A' + 10;
+            else if (c1 >= 'a' && c1 <= 'f') hi = c1 - 'a' + 10;
+            if (c2 >= '0' && c2 <= '9') lo = c2 - '0';
+            else if (c2 >= 'A' && c2 <= 'F') lo = c2 - 'A' + 10;
+            else if (c2 >= 'a' && c2 <= 'f') lo = c2 - 'a' + 10;
+            if (hi >= 0 && lo >= 0) {
+                *w++ = (char)((hi << 4) | lo);
+                r += 3;
+                continue;
+            }
+        }
+        *w++ = *r++;
+    }
+    *w = '\0';
+}
+
 /* Parse args в формате: "<id> <type> <key>=<val> <key>=<val> ..."
+ * String values приходят URL-encoded (%20 = space) от controller.
  * Out *ov заполняется defaults + parsed values. Returns 0 / AVERROR. */
 static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov)
 {
@@ -361,13 +387,19 @@ static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverla
             else if (!strcmp(key, "b"))         ov->u.rect.b = av_clip_uint8(atoi(val));
             else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val);
         } else if (ov->type == OV_TYPE_TEXT) {
-            if      (!strcmp(key, "text"))      av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text));
+            if      (!strcmp(key, "text"))      {
+                av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text));
+                url_decode_inplace(ov->u.text.text);
+            }
             else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val);
             else if (!strcmp(key, "r"))         ov->u.text.r = av_clip_uint8(atoi(val));
             else if (!strcmp(key, "g"))         ov->u.text.g = av_clip_uint8(atoi(val));
             else if (!strcmp(key, "b"))         ov->u.text.b = av_clip_uint8(atoi(val));
         } else if (ov->type == OV_TYPE_ICON) {
-            if (!strcmp(key, "icon_name")) av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name));
+            if (!strcmp(key, "icon_name")) {
+                av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name));
+                url_decode_inplace(ov->u.icon.icon_name);
+            }
         } else if (ov->type == OV_TYPE_DIM) {
             if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val));
         }
-- 
2.52.0


From c5130cb15ceb7e82febd0b549af0006264733182 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 22:41:11 +0100
Subject: [PATCH 12/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-4=20?=
 =?UTF-8?q?=E2=80=94=20icon=20overlay=20(PNG/JPG=20decode=20+=20sprite=20b?=
 =?UTF-8?q?lit)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PNG/JPG decode через libavcodec (системный decoder), conversion в RGBA8
через swscale, upload в pitched CUDA buffer, blit через Alpha_Blit_RGBA_Y/UV
(те же kernels что text).

Icon cache — shared by name (одна и та же иконка для нескольких overlays =
один GPU atlas). Lifecycle: load on first use, free в uninit.

Path resolution: <icon_dir>/<name> + try .png/.jpg extensions:
  /var/lib/cuda-grid/icons  (default)
  /opt/cuda-grid/icons      (fallback)
  + abs path supported

Options:
  icon_dir=  — overrides default search paths

Wire format: add_overlay <id> icon x=.. y=.. icon_name=domofon opacity=200

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 libavfilter/vf_cuda_grid.c | 273 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 272 insertions(+), 1 deletion(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 1a45eb1..32c4922 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -50,6 +50,10 @@
 #include FT_FREETYPE_H
 #endif
 
+#include "libavcodec/avcodec.h"
+#include "libavformat/avformat.h"
+#include "libswscale/swscale.h"
+
 #include "avfilter.h"
 #include "cuda/load_helper.h"
 #include "filters.h"
@@ -191,6 +195,17 @@ static const LayoutTemplate *find_layout(const char *name)
 
 /* ─── Filter state ─────────────────────────────────────────────────────── */
 
+/* Icon atlas — RGBA buffer на GPU, keyed by icon_name (Phase 4b-4, shared across overlays). */
+#define OVERLAY_ICON_NAME_MAX 32
+#define MAX_ICON_ATLASES 32
+
+typedef struct IconAtlas {
+    char        icon_name[OVERLAY_ICON_NAME_MAX];
+    CUdeviceptr device_ptr;
+    size_t      device_pitch;
+    int         w, h;
+} IconAtlas;
+
 /* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */
 typedef struct TextAtlas {
     char        id[OVERLAY_ID_MAX];
@@ -211,6 +226,7 @@ typedef struct CudaGridContext {
     int   out_height;
     char *font_file;
     int   default_font_size;
+    char *icon_dir;
 
     /* Resolved layout (после init) */
     const LayoutTemplate *layout;
@@ -249,6 +265,10 @@ typedef struct CudaGridContext {
 #endif
     TextAtlas   atlases[MAX_OVERLAYS];
     int         nb_atlases;
+
+    /* Icon atlas cache (Phase 4b-4) */
+    IconAtlas   icon_atlases[MAX_ICON_ATLASES];
+    int         nb_icon_atlases;
 } CudaGridContext;
 
 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -857,6 +877,247 @@ static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOve
 
 #endif  /* CONFIG_LIBFREETYPE */
 
+/* ─── Icon rendering (Phase 4b-4, PNG via libavcodec + swscale) ────────── */
+
+static const char *const default_icon_dirs[] = {
+    "/var/lib/cuda-grid/icons",
+    "/opt/cuda-grid/icons",
+    NULL,
+};
+
+/* Read image file, decode (PNG/JPEG/etc.), convert to packed RGBA8.
+ * Returns malloc'd buffer (caller frees) + dims, or NULL on error. */
+static uint8_t *load_image_to_rgba_cpu(AVFilterContext *ctx, const char *path,
+                                       int *out_w, int *out_h)
+{
+    AVFormatContext *fmt_ctx = NULL;
+    AVCodecContext  *cctx    = NULL;
+    AVFrame         *frame   = NULL;
+    AVPacket        *pkt     = NULL;
+    struct SwsContext *sws   = NULL;
+    const AVCodec   *codec;
+    int stream_idx;
+    uint8_t *rgba = NULL;
+    int rgba_pitch;
+    int ret;
+
+    if (avformat_open_input(&fmt_ctx, path, NULL, NULL) < 0) {
+        av_log(ctx, AV_LOG_WARNING, "icon: open %s failed\n", path);
+        return NULL;
+    }
+    if (avformat_find_stream_info(fmt_ctx, NULL) < 0) goto fail;
+
+    stream_idx = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
+    if (stream_idx < 0) goto fail;
+
+    codec = avcodec_find_decoder(fmt_ctx->streams[stream_idx]->codecpar->codec_id);
+    if (!codec) goto fail;
+
+    cctx = avcodec_alloc_context3(codec);
+    if (!cctx) goto fail;
+    if (avcodec_parameters_to_context(cctx, fmt_ctx->streams[stream_idx]->codecpar) < 0) goto fail;
+    if (avcodec_open2(cctx, codec, NULL) < 0) goto fail;
+
+    pkt   = av_packet_alloc();
+    frame = av_frame_alloc();
+    if (!pkt || !frame) goto fail;
+
+    if (av_read_frame(fmt_ctx, pkt) < 0) goto fail;
+    if (avcodec_send_packet(cctx, pkt) < 0) goto fail;
+    ret = avcodec_receive_frame(cctx, frame);
+    if (ret < 0) goto fail;
+
+    sws = sws_getContext(frame->width, frame->height, frame->format,
+                         frame->width, frame->height, AV_PIX_FMT_RGBA,
+                         SWS_BILINEAR, NULL, NULL, NULL);
+    if (!sws) goto fail;
+
+    rgba_pitch = frame->width * 4;
+    rgba = av_malloc((size_t)rgba_pitch * frame->height);
+    if (!rgba) goto fail;
+
+    {
+        uint8_t *out_data[4]  = { rgba, NULL, NULL, NULL };
+        int      out_pitch[4] = { rgba_pitch, 0, 0, 0 };
+        sws_scale(sws, (const uint8_t * const *)frame->data, frame->linesize,
+                  0, frame->height, out_data, out_pitch);
+    }
+
+    *out_w = frame->width;
+    *out_h = frame->height;
+
+fail:
+    if (sws)     sws_freeContext(sws);
+    if (frame)   av_frame_free(&frame);
+    if (pkt)     av_packet_free(&pkt);
+    if (cctx)    avcodec_free_context(&cctx);
+    if (fmt_ctx) avformat_close_input(&fmt_ctx);
+    return rgba;
+}
+
+/* Resolve icon name к full path (search dirs + extensions). */
+static int resolve_icon_path(AVFilterContext *ctx, const char *icon_name,
+                             char *out_path, size_t out_path_sz)
+{
+    CudaGridContext *s = ctx->priv;
+    static const char *const exts[] = { "", ".png", ".jpg", NULL };
+    const char *dirs[8] = { 0 };
+    int i, j, ndir = 0;
+
+    /* Если absolute path — use as-is */
+    if (icon_name[0] == '/') {
+        if (access(icon_name, R_OK) == 0) {
+            av_strlcpy(out_path, icon_name, out_path_sz);
+            return 0;
+        }
+        return AVERROR(ENOENT);
+    }
+
+    if (s->icon_dir) dirs[ndir++] = s->icon_dir;
+    for (i = 0; default_icon_dirs[i] && ndir < 7; i++) dirs[ndir++] = default_icon_dirs[i];
+
+    for (i = 0; i < ndir; i++) {
+        for (j = 0; exts[j]; j++) {
+            snprintf(out_path, out_path_sz, "%s/%s%s", dirs[i], icon_name, exts[j]);
+            if (access(out_path, R_OK) == 0) return 0;
+        }
+    }
+    return AVERROR(ENOENT);
+}
+
+static IconAtlas *icon_atlas_find_locked(CudaGridContext *s, const char *name)
+{
+    int i;
+    for (i = 0; i < s->nb_icon_atlases; i++)
+        if (!strcmp(s->icon_atlases[i].icon_name, name))
+            return &s->icon_atlases[i];
+    return NULL;
+}
+
+/* Ensure GPU atlas for icon_name loaded. cu ctx pushed by caller. */
+static int ensure_icon_atlas(AVFilterContext *ctx, const char *icon_name, IconAtlas **out)
+{
+    CudaGridContext *s = ctx->priv;
+    IconAtlas *a;
+    uint8_t *cpu_rgba;
+    char path[512];
+    int aw, ah;
+    CUdeviceptr dev_ptr = 0;
+    size_t dev_pitch = 0;
+    CUDA_MEMCPY2D cpy = { 0 };
+    int ret;
+
+    a = icon_atlas_find_locked(s, icon_name);
+    if (a && a->device_ptr) { *out = a; return 0; }
+    if (s->nb_icon_atlases >= MAX_ICON_ATLASES && !a) return AVERROR(ENOSPC);
+
+    ret = resolve_icon_path(ctx, icon_name, path, sizeof(path));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_WARNING, "icon '%s' not found in search paths\n", icon_name);
+        return ret;
+    }
+
+    cpu_rgba = load_image_to_rgba_cpu(ctx, path, &aw, &ah);
+    if (!cpu_rgba) return AVERROR(EINVAL);
+
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch(
+        &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4));
+    if (ret < 0) { av_free(cpu_rgba); return ret; }
+
+    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+    cpy.srcHost       = cpu_rgba;
+    cpy.srcPitch      = (size_t)aw * 4;
+    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    cpy.dstDevice     = dev_ptr;
+    cpy.dstPitch      = dev_pitch;
+    cpy.WidthInBytes  = (size_t)aw * 4;
+    cpy.Height        = (size_t)ah;
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy));
+    av_free(cpu_rgba);
+    if (ret < 0) {
+        s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr);
+        return ret;
+    }
+
+    if (!a) {
+        a = &s->icon_atlases[s->nb_icon_atlases++];
+        memset(a, 0, sizeof(*a));
+        av_strlcpy(a->icon_name, icon_name, sizeof(a->icon_name));
+    }
+    a->device_ptr   = dev_ptr;
+    a->device_pitch = dev_pitch;
+    a->w = aw;
+    a->h = ah;
+    *out = a;
+    av_log(ctx, AV_LOG_INFO, "icon '%s' loaded %s (%dx%d)\n", icon_name, path, aw, ah);
+    return 0;
+}
+
+static int render_overlay_icon(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    IconAtlas *a = NULL;
+    int px, py;
+    int base_x, base_y, base_w, base_h;
+    int aw, ah, ap, ealpha;
+    CUdeviceptr dst_y, dst_uv;
+    int ret;
+
+    if (!ov->u.icon.icon_name[0]) return 0;
+    ret = ensure_icon_atlas(ctx, ov->u.icon.icon_name, &a);
+    if (ret < 0 || !a) return 0;  /* non-fatal — skip */
+
+    if (ov->cell < 0) {
+        base_x = 0; base_y = 0;
+        base_w = s->out_width; base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells) {
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else return 0;
+
+    px = base_x + (int)(ov->x * base_w);
+    py = base_y + (int)(ov->y * base_h);
+    px &= ~1; py &= ~1;
+    if (px >= s->out_width || py >= s->out_height) return 0;
+
+    aw = a->w; ah = a->h;
+    if (px + aw > s->out_width)  aw = s->out_width  - px;
+    if (py + ah > s->out_height) ah = s->out_height - py;
+    aw &= ~1; ah &= ~1;
+    if (aw <= 0 || ah <= 0) return 0;
+
+    ap = (int)a->device_pitch;
+    ealpha = ov->opacity;
+
+    dst_y = (CUdeviceptr)out->data[0];
+    {
+        int dst_pitch_y = out->linesize[0];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_y,
+            DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    dst_uv = (CUdeviceptr)out->data[1];
+    {
+        int dst_pitch_uv = out->linesize[1];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_uv,
+            DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    return 0;
+}
+
 static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
 {
     CudaGridContext *s = ctx->priv;
@@ -947,7 +1208,8 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out)
 #endif
             break;
         case OV_TYPE_ICON:
-            av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id);
+            ret = render_overlay_icon(ctx, out, ov);
+            if (ret < 0) return ret;
             break;
         }
     }
@@ -1107,6 +1369,13 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
             }
         }
         s->nb_atlases = 0;
+        for (i = 0; i < s->nb_icon_atlases; i++) {
+            if (s->icon_atlases[i].device_ptr) {
+                s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr);
+                s->icon_atlases[i].device_ptr = 0;
+            }
+        }
+        s->nb_icon_atlases = 0;
         if (s->cu_module) {
             CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
             s->cu_module = NULL;
@@ -1340,6 +1609,8 @@ static const AVOption cuda_grid_options[] = {
       OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
     { "font_size", "default text overlay size (px), используется если overlay не указал свой",
       OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS },
+    { "icon_dir", "directory для resolve icon_name (по умолчанию /var/lib/cuda-grid/icons)",
+      OFFSET(icon_dir), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
     { NULL }
 };
 
-- 
2.52.0


From a326ef146ca510eb0a5d615e5cc33f731b5d79d3 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Wed, 20 May 2026 21:48:09 +0100
Subject: [PATCH 13/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=206=20=E2=80=94?=
 =?UTF-8?q?=20process=5Fcommand=20'reload=5Ficon=20<name>'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Invalidate cached icon atlas by name. Next render автоматически re-reads PNG
file с disk. Used controller'ом для periodic re-render dynamic overlays
(charts/chats) — write new PNG → send reload_icon → filter подхватывает.

Без этого pattern dynamic overlay невозможен — icon overlay cached forever
after first load by icon_name.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 libavfilter/vf_cuda_grid.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 32c4922..3bb2852 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -1450,6 +1450,39 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
         return 0;
     }
 
+    if (!strcmp(cmd, "reload_icon")) {
+        /* Invalidate cached icon atlas by name — next render re-reads file from disk.
+         * Used Phase 6 by controller для periodic chart/chat re-rendering. */
+        char name[OVERLAY_ICON_NAME_MAX];
+        int i, found = -1;
+        if (!arg || sscanf(arg, "%31s", name) != 1) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return AVERROR(EINVAL);
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        if (s->hwctx) {
+            CUcontext dummy;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+            for (i = 0; i < s->nb_icon_atlases; i++) {
+                if (!strcmp(s->icon_atlases[i].icon_name, name)) {
+                    if (s->icon_atlases[i].device_ptr) {
+                        s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr);
+                        s->icon_atlases[i].device_ptr = 0;
+                    }
+                    memmove(&s->icon_atlases[i], &s->icon_atlases[i + 1],
+                            (s->nb_icon_atlases - i - 1) * sizeof(IconAtlas));
+                    s->nb_icon_atlases--;
+                    found = i;
+                    break;
+                }
+            }
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        }
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, found >= 0 ? "ok %s reloaded" : "not_cached %s", name);
+        return 0;
+    }
+
     /* Future: set_layout (Phase 3 на filter side — пока nb_inputs хардкодится в init) */
 
     /* Fall back to standard option/command handling */
-- 
2.52.0


From 636bd7885410539595ef45c7e8dc36461ad04686 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Thu, 21 May 2026 07:08:23 +0100
Subject: [PATCH 14/14] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20overlay=20=D1=81=20in?=
 =?UTF-8?q?valid=20cell=20=E2=80=94=20silent=20skip=20=D0=B2=D0=BC=D0=B5?=
 =?UTF-8?q?=D1=81=D1=82=D0=BE=20AVERROR?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Когда controller broadcast'ит overlay command ко всем cuda_grid instances
(quad/single/mpp), overlay с cell=2 (gate_lpr cell в quad/mpp) попадает
в single layout который имеет nb_cells=1 → overlay_pixel_rect возвращал
AVERROR(EINVAL) → render_overlays propagated к cuda_grid_compose →
"Error while filtering: Invalid argument" каждый frame → pipeline crash loop.

Fix: return 1 (positive skip) вместо AVERROR. render_overlay_rect уже
имеет `if (ret != 0) return ret < 0 ? ret : 0;` — positive skip правильно
обрабатывается без error.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 libavfilter/vf_cuda_grid.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index 3bb2852..6367239 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -476,7 +476,10 @@ static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov,
         base_w = s->cell_px[ov->cell].w;
         base_h = s->cell_px[ov->cell].h;
     } else {
-        return AVERROR(EINVAL);
+        /* Cell index out of range — silently skip (overlay broadcast'нет ко всем
+         * cuda_grid instances, layouts с меньшим nb_cells). NOT error чтобы
+         * не валить filter chain. */
+        return 1;
     }
 
     rx = base_x + (int)(ov->x * base_w);
-- 
2.52.0