diff --git a/configure b/configure
index c24aa94..b94029e 100755
--- a/configure
+++ b/configure
@@ -3317,6 +3317,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+cuda_grid_filter_deps="ffnvcodec"
+cuda_grid_filter_deps_any="cuda_nvcc cuda_llvm"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 91487af..f570821 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -410,6 +410,8 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
                                                 cuda/load_helper.o
+OBJS-$(CONFIG_CUDA_GRID_FILTER)              += vf_cuda_grid.o framesync.o \
+                                                vf_cuda_grid.ptx.o cuda/load_helper.o
 OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
                                                 opencl/overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_QSV_FILTER)            += vf_overlay_qsv.o framesync.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 9819f0f..bc0d00a 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv;
 extern const AVFilter ff_vf_overlay_vaapi;
 extern const AVFilter ff_vf_overlay_vulkan;
 extern const AVFilter ff_vf_overlay_cuda;
+extern const AVFilter ff_vf_cuda_grid;
 extern const AVFilter ff_vf_owdenoise;
 extern const AVFilter ff_vf_pad;
 extern const AVFilter ff_vf_pad_opencl;
diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
new file mode 100644
index 0000000..6367239
--- /dev/null
+++ b/libavfilter/vf_cuda_grid.c
@@ -0,0 +1,1679 @@
+/*
+ * cuda_grid — GPU-native video grid composer для FFmpeg 7.x.
+ *
+ * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
+ * в layout. End-to-end CUDA (без CPU round-trip).
+ *
+ * Phase 2: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
+ * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size
+ * через option (default 1920×1080). Cell rects = normalized × output size.
+ *
+ * **Scaling delegated to upstream `scale_npp`** filter (Unix philosophy +
+ * production-tested NPP code). NPP не имеет nppiResize_8u_C2R для NV12 UV
+ * interleaved, поэтому in-filter scaling = either two intermediate plane
+ * buffers либо custom CUDA kernel — оба больше work чем filter chain'ить:
+ *
+ *   ffmpeg ... -filter_complex \
+ *     "[0]scale_npp=1280:1080[s0]; \
+ *      [1]scale_npp=640:360[s1]; \
+ *      [2]scale_npp=640:360[s2]; \
+ *      [3]scale_npp=640:360[s3]; \
+ *      [s0][s1][s2][s3]cuda_grid=layout=main_plus_preview[out]"
+ *
+ * Controller (Phase 3) auto-generates filter graph с scale_npp per input.
+ *
+ * Future phases (см. gx/vf-cuda-grid#1):
+ *  - Phase 3: runtime layout switching через process_command (ZMQ)
+ *  - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
+ *
+ * Лицензия: LGPL-2.1+ (соответствует FFmpeg)
+ */
+
+#include "config_components.h"
+
+#include <unistd.h>
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/log.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/thread.h"
+
+#include "config.h"
+#if CONFIG_LIBFREETYPE
+#include <ft2build.h>
+#include FT_FREETYPE_H
+#endif
+
+#include "libavcodec/avcodec.h"
+#include "libavformat/avformat.h"
+#include "libswscale/swscale.h"
+
+#include "avfilter.h"
+#include "cuda/load_helper.h"
+#include "filters.h"
+#include "formats.h"
+#include "framesync.h"
+#include "video.h"
+
+extern const unsigned char ff_vf_cuda_grid_ptx_data[];
+extern const unsigned int  ff_vf_cuda_grid_ptx_len;
+
+#define BLOCKX 16
+#define BLOCKY 16
+#define DIV_UP(a, b) (((a) + (b) - 1) / (b))
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+#define MAX_CELLS    16
+#define MAX_OVERLAYS 64
+#define OVERLAY_ID_MAX 32
+#define OVERLAY_TEXT_MAX 128
+
+/* ─── Overlay primitives (Phase 4b-1: rect only, no alpha) ─────────────── */
+
+typedef enum {
+    OV_TYPE_RECT = 0,
+    OV_TYPE_TEXT,   /* Phase 4b-3 */
+    OV_TYPE_ICON,   /* Phase 4b-4 */
+    OV_TYPE_DIM,    /* Phase 4b-2 (alpha kernel) */
+} GridOverlayType;
+
+typedef struct GridOverlay {
+    char            id[OVERLAY_ID_MAX];
+    GridOverlayType type;
+    int             cell;       /* -1 = absolute on output, иначе cell-relative */
+    float           x, y, w, h; /* [0..1] normalized; relative к cell или output */
+    int             z_order;    /* меньше → рисуется первым */
+    uint8_t         opacity;    /* 0..255, реально применяется в Phase 4b-2 */
+    int             visible;
+    union {
+        struct {
+            uint8_t r, g, b;
+            int     thickness;  /* 0 = filled, >0 = border (border via 4 strips) */
+        } rect;
+        struct {
+            char    text[OVERLAY_TEXT_MAX];
+            int     font_size;
+            uint8_t r, g, b;
+        } text;
+        struct {
+            char    icon_name[32];
+        } icon;
+        struct {
+            uint8_t amount; /* 0..255, alpha for dim */
+        } dim;
+    } u;
+} GridOverlay;
+
+/* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */
+
+typedef struct LayoutCell {
+    float x, y, w, h;   /* normalized fraction of output size */
+} LayoutCell;
+
+typedef struct LayoutTemplate {
+    const char *name;
+    int nb_cells;
+    LayoutCell cells[MAX_CELLS];
+} LayoutTemplate;
+
+/* Layouts copy-paste'нуты по структуре cctv-processor config/grids.json. */
+static const LayoutTemplate layouts[] = {
+    {
+        "single", 1,
+        { {0.0f, 0.0f, 1.0f, 1.0f} }
+    },
+    {
+        "dual_horizontal", 2,
+        { {0.0f, 0.0f, 0.5f, 1.0f}, {0.5f, 0.0f, 0.5f, 1.0f} }
+    },
+    {
+        "dual_vertical", 2,
+        { {0.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 0.5f, 1.0f, 0.5f} }
+    },
+    {
+        "quad", 4,
+        {
+            {0.0f, 0.0f, 0.5f, 0.5f}, {0.5f, 0.0f, 0.5f, 0.5f},
+            {0.0f, 0.5f, 0.5f, 0.5f}, {0.5f, 0.5f, 0.5f, 0.5f},
+        }
+    },
+    {
+        /* Main camera 2/3 width, 3 small cameras stacked справа сверху вниз */
+        "main_plus_preview", 4,
+        {
+            {0.0f,      0.0f, 2.0f/3, 1.0f},
+            {2.0f/3,    0.0f, 1.0f/3, 1.0f/3},
+            {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
+            {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
+        }
+    },
+    {
+        "six_grid", 6,
+        {
+            {0.0f, 0.0f, 1.0f/3, 0.5f}, {1.0f/3, 0.0f, 1.0f/3, 0.5f}, {2.0f/3, 0.0f, 1.0f/3, 0.5f},
+            {0.0f, 0.5f, 1.0f/3, 0.5f}, {1.0f/3, 0.5f, 1.0f/3, 0.5f}, {2.0f/3, 0.5f, 1.0f/3, 0.5f},
+        }
+    },
+    {
+        "nine_grid", 9,
+        {
+            {0.0f, 0.0f, 1.0f/3, 1.0f/3}, {1.0f/3, 0.0f, 1.0f/3, 1.0f/3}, {2.0f/3, 0.0f, 1.0f/3, 1.0f/3},
+            {0.0f, 1.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3},
+            {0.0f, 2.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3},
+        }
+    },
+    {
+        "sixteen_grid", 16,
+        {
+            {0.00f, 0.00f, 0.25f, 0.25f}, {0.25f, 0.00f, 0.25f, 0.25f}, {0.50f, 0.00f, 0.25f, 0.25f}, {0.75f, 0.00f, 0.25f, 0.25f},
+            {0.00f, 0.25f, 0.25f, 0.25f}, {0.25f, 0.25f, 0.25f, 0.25f}, {0.50f, 0.25f, 0.25f, 0.25f}, {0.75f, 0.25f, 0.25f, 0.25f},
+            {0.00f, 0.50f, 0.25f, 0.25f}, {0.25f, 0.50f, 0.25f, 0.25f}, {0.50f, 0.50f, 0.25f, 0.25f}, {0.75f, 0.50f, 0.25f, 0.25f},
+            {0.00f, 0.75f, 0.25f, 0.25f}, {0.25f, 0.75f, 0.25f, 0.25f}, {0.50f, 0.75f, 0.25f, 0.25f}, {0.75f, 0.75f, 0.25f, 0.25f},
+        }
+    },
+    {
+        /* Один widescreen panoramic — 1 cell full width */
+        "panoramic", 1,
+        { {0.0f, 0.0f, 1.0f, 1.0f} }
+    },
+};
+
+static const LayoutTemplate *find_layout(const char *name)
+{
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(layouts); i++) {
+        if (!strcmp(layouts[i].name, name))
+            return &layouts[i];
+    }
+    return NULL;
+}
+
+/* ─── Filter state ─────────────────────────────────────────────────────── */
+
+/* Icon atlas — RGBA buffer на GPU, keyed by icon_name (Phase 4b-4, shared across overlays). */
+#define OVERLAY_ICON_NAME_MAX 32
+#define MAX_ICON_ATLASES 32
+
+typedef struct IconAtlas {
+    char        icon_name[OVERLAY_ICON_NAME_MAX];
+    CUdeviceptr device_ptr;
+    size_t      device_pitch;
+    int         w, h;
+} IconAtlas;
+
+/* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */
+typedef struct TextAtlas {
+    char        id[OVERLAY_ID_MAX];
+    char        cached_text[OVERLAY_TEXT_MAX];
+    int         cached_font_size;
+    uint8_t     cached_r, cached_g, cached_b;
+    CUdeviceptr device_ptr;
+    size_t      device_pitch;
+    int         w, h;
+} TextAtlas;
+
+typedef struct CudaGridContext {
+    const AVClass *class;
+
+    /* Options */
+    char *layout_name;
+    int   out_width;
+    int   out_height;
+    char *font_file;
+    int   default_font_size;
+    char *icon_dir;
+
+    /* Resolved layout (после init) */
+    const LayoutTemplate *layout;
+
+    /* CUDA */
+    AVCUDADeviceContext *hwctx;
+    CUcontext cu_ctx;
+    CUstream  cu_stream;
+
+    FFFrameSync fs;
+
+    /* Per-cell pixel rects (computed в config_output из normalized × out size) */
+    struct {
+        int x, y, w, h;
+    } cell_px[MAX_CELLS];
+
+    /* Overlay state (Phase 4b) — mutex-guarded, process_command/render thread-safe */
+    GridOverlay     overlays[MAX_OVERLAYS];
+    int             nb_overlays;
+    pthread_mutex_t overlay_lock;
+    int             overlay_lock_inited;
+
+    /* CUDA kernels (Phase 4b-2) — loaded from embedded .ptx */
+    CUmodule    cu_module;
+    CUfunction  cu_func_alpha_fill_y;
+    CUfunction  cu_func_alpha_fill_uv;
+    CUfunction  cu_func_alpha_blit_rgba_y;
+    CUfunction  cu_func_alpha_blit_rgba_uv;
+
+    /* Text rendering (Phase 4b-3, libfreetype optional) */
+#if CONFIG_LIBFREETYPE
+    FT_Library  ft_lib;
+    FT_Face     ft_face;
+    int         ft_ready;          /* 1 once library + face loaded */
+    int         ft_init_attempted; /* set on first attempt, success or fail */
+#endif
+    TextAtlas   atlases[MAX_OVERLAYS];
+    int         nb_atlases;
+
+    /* Icon atlas cache (Phase 4b-4) */
+    IconAtlas   icon_atlases[MAX_ICON_ATLASES];
+    int         nb_icon_atlases;
+} CudaGridContext;
+
+/* ─── Composition: copy одного input plane в target region output ──────── */
+
+static int copy_input_plane(AVFilterContext *ctx,
+                            CUdeviceptr     src_data,
+                            int             src_pitch,
+                            int             src_w,
+                            int             src_h,
+                            CUdeviceptr     dst_data,
+                            int             dst_pitch,
+                            int             dst_x,
+                            int             dst_y,
+                            int             bytes_per_pixel)
+{
+    CudaGridContext *s = ctx->priv;
+    CUDA_MEMCPY2D cpy = {
+        .srcMemoryType = CU_MEMORYTYPE_DEVICE,
+        .srcDevice     = src_data,
+        .srcPitch      = src_pitch,
+        .dstMemoryType = CU_MEMORYTYPE_DEVICE,
+        .dstDevice     = dst_data,
+        .dstXInBytes   = (size_t)dst_x * bytes_per_pixel,
+        .dstY          = dst_y,
+        .dstPitch      = dst_pitch,
+        .WidthInBytes  = (size_t)src_w * bytes_per_pixel,
+        .Height        = src_h,
+    };
+    return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
+}
+
+/* ─── Overlay parsing + rendering (Phase 4b-1: rect, solid fill, no alpha) ─ */
+
+/* BT.709 limited-range RGB → YUV. Достаточно для HDTV (любое 1920×1080 output). */
+static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b,
+                                            uint8_t *Y, uint8_t *U, uint8_t *V)
+{
+    int y = (int)( 0.183f * r + 0.614f * g + 0.062f * b) + 16;
+    int u = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128;
+    int v = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128;
+    *Y = av_clip_uint8(y);
+    *U = av_clip_uint8(u);
+    *V = av_clip_uint8(v);
+}
+
+/* In-place URL-decode (%20 etc.) — for string values приходящих от controller. */
+static void url_decode_inplace(char *s)
+{
+    char *r = s, *w = s;
+    while (*r) {
+        if (r[0] == '%' && r[1] && r[2]) {
+            int hi = -1, lo = -1;
+            char c1 = r[1], c2 = r[2];
+            if (c1 >= '0' && c1 <= '9') hi = c1 - '0';
+            else if (c1 >= 'A' && c1 <= 'F') hi = c1 - 'A' + 10;
+            else if (c1 >= 'a' && c1 <= 'f') hi = c1 - 'a' + 10;
+            if (c2 >= '0' && c2 <= '9') lo = c2 - '0';
+            else if (c2 >= 'A' && c2 <= 'F') lo = c2 - 'A' + 10;
+            else if (c2 >= 'a' && c2 <= 'f') lo = c2 - 'a' + 10;
+            if (hi >= 0 && lo >= 0) {
+                *w++ = (char)((hi << 4) | lo);
+                r += 3;
+                continue;
+            }
+        }
+        *w++ = *r++;
+    }
+    *w = '\0';
+}
+
+/* Parse args в формате: "<id> <type> <key>=<val> <key>=<val> ..."
+ * String values приходят URL-encoded (%20 = space) от controller.
+ * Out *ov заполняется defaults + parsed values. Returns 0 / AVERROR. */
+static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov)
+{
+    char id[OVERLAY_ID_MAX], type_str[16];
+    const char *p;
+
+    if (!args)
+        return AVERROR(EINVAL);
+
+    if (sscanf(args, "%31s %15s", id, type_str) != 2) {
+        av_log(ctx, AV_LOG_ERROR, "overlay args: expected '<id> <type> ...', got: %s\n", args);
+        return AVERROR(EINVAL);
+    }
+
+    memset(ov, 0, sizeof(*ov));
+    av_strlcpy(ov->id, id, sizeof(ov->id));
+
+    if      (!strcmp(type_str, "rect")) ov->type = OV_TYPE_RECT;
+    else if (!strcmp(type_str, "text")) ov->type = OV_TYPE_TEXT;
+    else if (!strcmp(type_str, "icon")) ov->type = OV_TYPE_ICON;
+    else if (!strcmp(type_str, "dim"))  ov->type = OV_TYPE_DIM;
+    else {
+        av_log(ctx, AV_LOG_ERROR, "unknown overlay type: %s\n", type_str);
+        return AVERROR(EINVAL);
+    }
+
+    /* Defaults */
+    ov->cell    = -1;
+    ov->opacity = 255;
+    ov->visible = 1;
+    if (ov->type == OV_TYPE_RECT)
+        ov->u.rect.thickness = 0;  /* filled */
+    if (ov->type == OV_TYPE_DIM)
+        ov->u.dim.amount = 128;
+
+    /* Advance past <id> и <type> */
+    p = strchr(args, ' ');
+    if (!p) return 0;
+    while (*p == ' ') p++;
+    p = strchr(p, ' ');
+    if (!p) return 0;
+    while (*p == ' ') p++;
+
+    while (*p) {
+        char key[32], val[OVERLAY_TEXT_MAX];
+        int n = 0;
+        if (sscanf(p, "%31[^= \t]=%127s%n", key, val, &n) < 2)
+            break;
+        p += n;
+        while (*p == ' ' || *p == '\t') p++;
+
+        if      (!strcmp(key, "cell"))    ov->cell    = atoi(val);
+        else if (!strcmp(key, "x"))       ov->x       = (float)atof(val);
+        else if (!strcmp(key, "y"))       ov->y       = (float)atof(val);
+        else if (!strcmp(key, "w"))       ov->w       = (float)atof(val);
+        else if (!strcmp(key, "h"))       ov->h       = (float)atof(val);
+        else if (!strcmp(key, "z_order") || !strcmp(key, "z"))
+                                          ov->z_order = atoi(val);
+        else if (!strcmp(key, "opacity")) ov->opacity = av_clip(atoi(val), 0, 255);
+        else if (!strcmp(key, "visible")) ov->visible = atoi(val) ? 1 : 0;
+        else if (ov->type == OV_TYPE_RECT) {
+            if      (!strcmp(key, "r"))         ov->u.rect.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.rect.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.rect.b = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val);
+        } else if (ov->type == OV_TYPE_TEXT) {
+            if      (!strcmp(key, "text"))      {
+                av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text));
+                url_decode_inplace(ov->u.text.text);
+            }
+            else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val);
+            else if (!strcmp(key, "r"))         ov->u.text.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.text.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.text.b = av_clip_uint8(atoi(val));
+        } else if (ov->type == OV_TYPE_ICON) {
+            if (!strcmp(key, "icon_name")) {
+                av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name));
+                url_decode_inplace(ov->u.icon.icon_name);
+            }
+        } else if (ov->type == OV_TYPE_DIM) {
+            if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val));
+        }
+    }
+    return 0;
+}
+
+/* Lock must be held by caller. */
+static int overlay_upsert_locked(CudaGridContext *s, const GridOverlay *ov)
+{
+    int i;
+    for (i = 0; i < s->nb_overlays; i++) {
+        if (!strcmp(s->overlays[i].id, ov->id)) {
+            s->overlays[i] = *ov;
+            return 0;
+        }
+    }
+    if (s->nb_overlays >= MAX_OVERLAYS)
+        return AVERROR(ENOSPC);
+    s->overlays[s->nb_overlays++] = *ov;
+    return 0;
+}
+
+static int overlay_remove_locked(CudaGridContext *s, const char *id)
+{
+    int i;
+    for (i = 0; i < s->nb_overlays; i++) {
+        if (!strcmp(s->overlays[i].id, id)) {
+            memmove(&s->overlays[i], &s->overlays[i + 1],
+                    (s->nb_overlays - i - 1) * sizeof(GridOverlay));
+            s->nb_overlays--;
+            return 0;
+        }
+    }
+    return AVERROR(ENOENT);
+}
+
+/* Compute pixel rect для overlay: либо cell-relative, либо absolute.
+ * Clips против output bounds + 2px alignment для NV12 chroma. */
+static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov,
+                              int *out_x, int *out_y, int *out_w, int *out_h)
+{
+    int rx, ry, rw, rh;
+    int base_x, base_y, base_w, base_h;
+
+    if (ov->cell < 0) {
+        base_x = 0;
+        base_y = 0;
+        base_w = s->out_width;
+        base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells) {
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else {
+        /* Cell index out of range — silently skip (overlay broadcast'нет ко всем
+         * cuda_grid instances, layouts с меньшим nb_cells). NOT error чтобы
+         * не валить filter chain. */
+        return 1;
+    }
+
+    rx = base_x + (int)(ov->x * base_w);
+    ry = base_y + (int)(ov->y * base_h);
+    rw = (int)(ov->w * base_w);
+    rh = (int)(ov->h * base_h);
+
+    if (rx < 0)             { rw += rx; rx = 0; }
+    if (ry < 0)             { rh += ry; ry = 0; }
+    if (rx + rw > s->out_width)  rw = s->out_width  - rx;
+    if (ry + rh > s->out_height) rh = s->out_height - ry;
+    rx &= ~1; ry &= ~1; rw &= ~1; rh &= ~1;
+
+    *out_x = rx; *out_y = ry; *out_w = rw; *out_h = rh;
+    return (rw > 0 && rh > 0) ? 0 : 1;  /* 1 → empty, skip */
+}
+
+/* Strip α-blend (opacity 0..255). cu ctx must be pushed.
+ * opacity=255 → solid fill (kernel uses single-pass blend formula). */
+static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out,
+                              int x, int y, int w, int h,
+                              uint8_t Y, uint8_t U, uint8_t V, uint8_t alpha)
+{
+    CudaGridContext *s = ctx->priv;
+    CUdeviceptr dst_y, dst_uv;
+    int ret;
+    int iY = Y, iU = U, iV = V, iA = alpha;
+    int rx, ry, rw, rh;
+
+    x &= ~1; y &= ~1; w &= ~1; h &= ~1;
+    if (w <= 0 || h <= 0 || alpha == 0) return 0;
+
+    /* Y plane */
+    dst_y = (CUdeviceptr)out->data[0];
+    rx = x; ry = y; rw = w; rh = h;
+    {
+        int dst_pitch_y = out->linesize[0];
+        void *args[] = { &dst_y, &dst_pitch_y, &rx, &ry, &rw, &rh, &iY, &iA };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_fill_y,
+            DIV_UP(w, BLOCKX), DIV_UP(h, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+
+    /* UV plane (half res, interleaved 2 bytes per chroma sample) */
+    dst_uv = (CUdeviceptr)out->data[1];
+    rx = x / 2; ry = y / 2; rw = w / 2; rh = h / 2;
+    {
+        int dst_pitch_uv = out->linesize[1];
+        void *args[] = { &dst_uv, &dst_pitch_uv, &rx, &ry, &rw, &rh, &iU, &iV, &iA };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_fill_uv,
+            DIV_UP(w / 2, BLOCKX), DIV_UP(h / 2, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    return 0;
+}
+
+/* ─── Text rendering (Phase 4b-3, freetype) ────────────────────────────── */
+
+#if CONFIG_LIBFREETYPE
+
+static const char *const default_fonts[] = {
+    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
+    "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
+    "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf",
+    "/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf",
+    NULL,
+};
+
+static int ensure_ft_loaded(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    const char *path = s->font_file;
+    FT_Error err;
+    int i;
+
+    if (s->ft_ready) return 0;
+    if (s->ft_init_attempted) return AVERROR(ENOSYS);
+    s->ft_init_attempted = 1;
+
+    err = FT_Init_FreeType(&s->ft_lib);
+    if (err) {
+        av_log(ctx, AV_LOG_WARNING, "FT_Init_FreeType failed (err=%d), text overlays disabled\n", err);
+        return AVERROR(ENOSYS);
+    }
+
+    if (!path) {
+        for (i = 0; default_fonts[i]; i++) {
+            if (access(default_fonts[i], R_OK) == 0) {
+                path = default_fonts[i];
+                break;
+            }
+        }
+    }
+    if (!path) {
+        av_log(ctx, AV_LOG_WARNING, "no font found (set font_file= option), text disabled\n");
+        FT_Done_FreeType(s->ft_lib);
+        s->ft_lib = NULL;
+        return AVERROR(ENOENT);
+    }
+
+    err = FT_New_Face(s->ft_lib, path, 0, &s->ft_face);
+    if (err) {
+        av_log(ctx, AV_LOG_WARNING, "FT_New_Face(%s) failed: %d, text disabled\n", path, err);
+        FT_Done_FreeType(s->ft_lib);
+        s->ft_lib = NULL;
+        return AVERROR(EIO);
+    }
+
+    av_log(ctx, AV_LOG_INFO, "freetype: loaded %s\n", path);
+    s->ft_ready = 1;
+    return 0;
+}
+
+/* Two-pass rasterize: 1) measure total_w / ascent / descent, 2) blit to RGBA buffer.
+ * Returns malloc'd RGBA buffer (caller frees) + dims via out params, or NULL. */
+static uint8_t *rasterize_text_rgba(AVFilterContext *ctx, const char *text, int font_size,
+                                    uint8_t r, uint8_t g, uint8_t b,
+                                    int *out_w, int *out_h)
+{
+    CudaGridContext *s = ctx->priv;
+    FT_Error err;
+    const unsigned char *p;
+    int total_w = 0, ascent_px = 0, descent_px = 0;
+    int atlas_w, atlas_h, pen_x, baseline_y;
+    uint8_t *atlas;
+
+    if (!s->ft_ready) return NULL;
+    err = FT_Set_Pixel_Sizes(s->ft_face, 0, font_size);
+    if (err) {
+        av_log(ctx, AV_LOG_WARNING, "FT_Set_Pixel_Sizes failed: %d\n", err);
+        return NULL;
+    }
+
+    /* Pass 1: measure */
+    for (p = (const unsigned char *)text; *p; p++) {
+        FT_UInt gi;
+        FT_GlyphSlot g_slot;
+        gi = FT_Get_Char_Index(s->ft_face, *p);
+        if (!gi) continue;
+        if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_DEFAULT)) continue;
+        g_slot = s->ft_face->glyph;
+        total_w  += (g_slot->advance.x >> 6);
+        ascent_px  = FFMAX(ascent_px,  g_slot->bitmap_top);
+        descent_px = FFMAX(descent_px, (int)g_slot->bitmap.rows - g_slot->bitmap_top);
+    }
+    if (total_w <= 0) return NULL;
+
+    atlas_w = total_w + 4;  /* padding */
+    atlas_h = ascent_px + descent_px + 4;
+    if (atlas_h <= 0) atlas_h = font_size;
+
+    atlas = av_mallocz((size_t)atlas_w * atlas_h * 4);
+    if (!atlas) return NULL;
+
+    /* Pass 2: blit */
+    pen_x      = 2;
+    baseline_y = ascent_px + 2;
+    for (p = (const unsigned char *)text; *p; p++) {
+        FT_UInt gi;
+        FT_GlyphSlot g_slot;
+        int gx, gy, bw, bh;
+        int yy, xx;
+
+        gi = FT_Get_Char_Index(s->ft_face, *p);
+        if (!gi) continue;
+        if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_RENDER)) continue;
+        g_slot = s->ft_face->glyph;
+        if (g_slot->bitmap.pixel_mode != FT_PIXEL_MODE_GRAY) {
+            pen_x += g_slot->advance.x >> 6;
+            continue;
+        }
+        bw = g_slot->bitmap.width;
+        bh = g_slot->bitmap.rows;
+        gx = pen_x + g_slot->bitmap_left;
+        gy = baseline_y - g_slot->bitmap_top;
+
+        for (yy = 0; yy < bh; yy++) {
+            int dy = gy + yy;
+            if (dy < 0 || dy >= atlas_h) continue;
+            for (xx = 0; xx < bw; xx++) {
+                int dx = gx + xx;
+                uint8_t gray;
+                uint8_t *dst;
+                if (dx < 0 || dx >= atlas_w) continue;
+                gray = g_slot->bitmap.buffer[yy * g_slot->bitmap.pitch + xx];
+                if (gray == 0) continue;
+                dst = atlas + ((size_t)dy * atlas_w + dx) * 4;
+                /* Premultiplied: max() — text "fills" rather than blending overlaps */
+                if (gray > dst[3]) {
+                    dst[0] = r;
+                    dst[1] = g;
+                    dst[2] = b;
+                    dst[3] = gray;
+                }
+            }
+        }
+        pen_x += g_slot->advance.x >> 6;
+    }
+
+    *out_w = atlas_w;
+    *out_h = atlas_h;
+    return atlas;
+}
+
+#endif  /* CONFIG_LIBFREETYPE */
+
+/* Cache lookup — locked by caller. */
+static TextAtlas *atlas_find_locked(CudaGridContext *s, const char *id)
+{
+    int i;
+    for (i = 0; i < s->nb_atlases; i++)
+        if (!strcmp(s->atlases[i].id, id))
+            return &s->atlases[i];
+    return NULL;
+}
+
+static void atlas_free_gpu_locked(CudaGridContext *s, TextAtlas *a)
+{
+    if (a->device_ptr) {
+        s->hwctx->internal->cuda_dl->cuMemFree(a->device_ptr);
+        a->device_ptr = 0;
+    }
+}
+
+static void atlas_remove_locked(CudaGridContext *s, const char *id)
+{
+    int i;
+    for (i = 0; i < s->nb_atlases; i++) {
+        if (!strcmp(s->atlases[i].id, id)) {
+            atlas_free_gpu_locked(s, &s->atlases[i]);
+            memmove(&s->atlases[i], &s->atlases[i + 1],
+                    (s->nb_atlases - i - 1) * sizeof(TextAtlas));
+            s->nb_atlases--;
+            return;
+        }
+    }
+}
+
+#if CONFIG_LIBFREETYPE
+/* Ensure GPU atlas for given overlay matches its text/font_size/color. cu ctx pushed by caller. */
+static int ensure_text_atlas(AVFilterContext *ctx, const GridOverlay *ov, TextAtlas **out_atlas)
+{
+    CudaGridContext *s = ctx->priv;
+    TextAtlas *a;
+    uint8_t *cpu_atlas;
+    int aw, ah;
+    int font_size;
+    CUdeviceptr dev_ptr = 0;
+    size_t dev_pitch = 0;
+    CUDA_MEMCPY2D cpy = { 0 };
+    int ret;
+
+    font_size = ov->u.text.font_size > 0 ? ov->u.text.font_size : s->default_font_size;
+
+    a = atlas_find_locked(s, ov->id);
+    if (a &&
+        !strcmp(a->cached_text, ov->u.text.text) &&
+        a->cached_font_size == font_size &&
+        a->cached_r == ov->u.text.r &&
+        a->cached_g == ov->u.text.g &&
+        a->cached_b == ov->u.text.b) {
+        *out_atlas = a;
+        return 0;
+    }
+
+    if (s->nb_atlases >= MAX_OVERLAYS && !a)
+        return AVERROR(ENOSPC);
+
+    ret = ensure_ft_loaded(ctx);
+    if (ret < 0) return ret;
+
+    cpu_atlas = rasterize_text_rgba(ctx, ov->u.text.text, font_size,
+                                    ov->u.text.r, ov->u.text.g, ov->u.text.b,
+                                    &aw, &ah);
+    if (!cpu_atlas) return AVERROR(EINVAL);
+
+    /* Allocate pitched device buffer + upload */
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch(
+        &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4));
+    if (ret < 0) { av_free(cpu_atlas); return ret; }
+
+    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+    cpy.srcHost       = cpu_atlas;
+    cpy.srcPitch      = (size_t)aw * 4;
+    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    cpy.dstDevice     = dev_ptr;
+    cpy.dstPitch      = dev_pitch;
+    cpy.WidthInBytes  = (size_t)aw * 4;
+    cpy.Height        = (size_t)ah;
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy));
+    av_free(cpu_atlas);
+    if (ret < 0) {
+        s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr);
+        return ret;
+    }
+
+    if (a) {
+        atlas_free_gpu_locked(s, a);
+    } else {
+        a = &s->atlases[s->nb_atlases++];
+        memset(a, 0, sizeof(*a));
+        av_strlcpy(a->id, ov->id, sizeof(a->id));
+    }
+    av_strlcpy(a->cached_text, ov->u.text.text, sizeof(a->cached_text));
+    a->cached_font_size = font_size;
+    a->cached_r = ov->u.text.r;
+    a->cached_g = ov->u.text.g;
+    a->cached_b = ov->u.text.b;
+    a->device_ptr   = dev_ptr;
+    a->device_pitch = dev_pitch;
+    a->w = aw;
+    a->h = ah;
+    *out_atlas = a;
+    return 0;
+}
+
+static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    TextAtlas *a = NULL;
+    int px, py;
+    int base_x, base_y, base_w, base_h;
+    CUdeviceptr dst_y, dst_uv;
+    int ret;
+    int aw, ah, ap, ealpha;
+
+    if (!ov->u.text.text[0]) return 0;
+    ret = ensure_text_atlas(ctx, ov, &a);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_WARNING, "text overlay %s: atlas failed (ret=%d)\n", ov->id, ret);
+        return 0; /* non-fatal — skip */
+    }
+    if (!a) return 0;
+
+    /* Position: x,y normalized within cell or output (top-left corner of atlas). */
+    if (ov->cell < 0) {
+        base_x = 0; base_y = 0;
+        base_w = s->out_width; base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells) {
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else {
+        return 0;
+    }
+    px = base_x + (int)(ov->x * base_w);
+    py = base_y + (int)(ov->y * base_h);
+    px &= ~1; py &= ~1;
+    /* Clip — if atlas runs past output bounds, kernels still safe (they bounds-check
+     * against atlas w/h, not dst). But avoid going off output: simple skip when fully outside. */
+    if (px >= s->out_width || py >= s->out_height) return 0;
+
+    aw = a->w; ah = a->h;
+    /* Reduce ah if it overflows output (avoid OOB on Y plane); UV plane uses h/2 so same scale. */
+    if (px + aw > s->out_width)  aw = s->out_width  - px;
+    if (py + ah > s->out_height) ah = s->out_height - py;
+    aw &= ~1; ah &= ~1;
+    if (aw <= 0 || ah <= 0) return 0;
+
+    ap = (int)a->device_pitch;
+    ealpha = ov->opacity;
+
+    /* Y plane */
+    dst_y = (CUdeviceptr)out->data[0];
+    {
+        int dst_pitch_y = out->linesize[0];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_y,
+            DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    /* UV plane (kernel does internal half-res subsampling) */
+    dst_uv = (CUdeviceptr)out->data[1];
+    {
+        int dst_pitch_uv = out->linesize[1];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_uv,
+            DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    return 0;
+}
+
+#endif  /* CONFIG_LIBFREETYPE */
+
+/* ─── Icon rendering (Phase 4b-4, PNG via libavcodec + swscale) ────────── */
+
+static const char *const default_icon_dirs[] = {
+    "/var/lib/cuda-grid/icons",
+    "/opt/cuda-grid/icons",
+    NULL,
+};
+
+/* Read image file, decode (PNG/JPEG/etc.), convert to packed RGBA8.
+ * Returns malloc'd buffer (caller frees) + dims, or NULL on error. */
+static uint8_t *load_image_to_rgba_cpu(AVFilterContext *ctx, const char *path,
+                                       int *out_w, int *out_h)
+{
+    AVFormatContext *fmt_ctx = NULL;
+    AVCodecContext  *cctx    = NULL;
+    AVFrame         *frame   = NULL;
+    AVPacket        *pkt     = NULL;
+    struct SwsContext *sws   = NULL;
+    const AVCodec   *codec;
+    int stream_idx;
+    uint8_t *rgba = NULL;
+    int rgba_pitch;
+    int ret;
+
+    if (avformat_open_input(&fmt_ctx, path, NULL, NULL) < 0) {
+        av_log(ctx, AV_LOG_WARNING, "icon: open %s failed\n", path);
+        return NULL;
+    }
+    if (avformat_find_stream_info(fmt_ctx, NULL) < 0) goto fail;
+
+    stream_idx = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
+    if (stream_idx < 0) goto fail;
+
+    codec = avcodec_find_decoder(fmt_ctx->streams[stream_idx]->codecpar->codec_id);
+    if (!codec) goto fail;
+
+    cctx = avcodec_alloc_context3(codec);
+    if (!cctx) goto fail;
+    if (avcodec_parameters_to_context(cctx, fmt_ctx->streams[stream_idx]->codecpar) < 0) goto fail;
+    if (avcodec_open2(cctx, codec, NULL) < 0) goto fail;
+
+    pkt   = av_packet_alloc();
+    frame = av_frame_alloc();
+    if (!pkt || !frame) goto fail;
+
+    if (av_read_frame(fmt_ctx, pkt) < 0) goto fail;
+    if (avcodec_send_packet(cctx, pkt) < 0) goto fail;
+    ret = avcodec_receive_frame(cctx, frame);
+    if (ret < 0) goto fail;
+
+    sws = sws_getContext(frame->width, frame->height, frame->format,
+                         frame->width, frame->height, AV_PIX_FMT_RGBA,
+                         SWS_BILINEAR, NULL, NULL, NULL);
+    if (!sws) goto fail;
+
+    rgba_pitch = frame->width * 4;
+    rgba = av_malloc((size_t)rgba_pitch * frame->height);
+    if (!rgba) goto fail;
+
+    {
+        uint8_t *out_data[4]  = { rgba, NULL, NULL, NULL };
+        int      out_pitch[4] = { rgba_pitch, 0, 0, 0 };
+        sws_scale(sws, (const uint8_t * const *)frame->data, frame->linesize,
+                  0, frame->height, out_data, out_pitch);
+    }
+
+    *out_w = frame->width;
+    *out_h = frame->height;
+
+fail:
+    if (sws)     sws_freeContext(sws);
+    if (frame)   av_frame_free(&frame);
+    if (pkt)     av_packet_free(&pkt);
+    if (cctx)    avcodec_free_context(&cctx);
+    if (fmt_ctx) avformat_close_input(&fmt_ctx);
+    return rgba;
+}
+
+/* Resolve icon name к full path (search dirs + extensions). */
+static int resolve_icon_path(AVFilterContext *ctx, const char *icon_name,
+                             char *out_path, size_t out_path_sz)
+{
+    CudaGridContext *s = ctx->priv;
+    static const char *const exts[] = { "", ".png", ".jpg", NULL };
+    const char *dirs[8] = { 0 };
+    int i, j, ndir = 0;
+
+    /* Если absolute path — use as-is */
+    if (icon_name[0] == '/') {
+        if (access(icon_name, R_OK) == 0) {
+            av_strlcpy(out_path, icon_name, out_path_sz);
+            return 0;
+        }
+        return AVERROR(ENOENT);
+    }
+
+    if (s->icon_dir) dirs[ndir++] = s->icon_dir;
+    for (i = 0; default_icon_dirs[i] && ndir < 7; i++) dirs[ndir++] = default_icon_dirs[i];
+
+    for (i = 0; i < ndir; i++) {
+        for (j = 0; exts[j]; j++) {
+            snprintf(out_path, out_path_sz, "%s/%s%s", dirs[i], icon_name, exts[j]);
+            if (access(out_path, R_OK) == 0) return 0;
+        }
+    }
+    return AVERROR(ENOENT);
+}
+
+static IconAtlas *icon_atlas_find_locked(CudaGridContext *s, const char *name)
+{
+    int i;
+    for (i = 0; i < s->nb_icon_atlases; i++)
+        if (!strcmp(s->icon_atlases[i].icon_name, name))
+            return &s->icon_atlases[i];
+    return NULL;
+}
+
+/* Ensure GPU atlas for icon_name loaded. cu ctx pushed by caller. */
+static int ensure_icon_atlas(AVFilterContext *ctx, const char *icon_name, IconAtlas **out)
+{
+    CudaGridContext *s = ctx->priv;
+    IconAtlas *a;
+    uint8_t *cpu_rgba;
+    char path[512];
+    int aw, ah;
+    CUdeviceptr dev_ptr = 0;
+    size_t dev_pitch = 0;
+    CUDA_MEMCPY2D cpy = { 0 };
+    int ret;
+
+    a = icon_atlas_find_locked(s, icon_name);
+    if (a && a->device_ptr) { *out = a; return 0; }
+    if (s->nb_icon_atlases >= MAX_ICON_ATLASES && !a) return AVERROR(ENOSPC);
+
+    ret = resolve_icon_path(ctx, icon_name, path, sizeof(path));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_WARNING, "icon '%s' not found in search paths\n", icon_name);
+        return ret;
+    }
+
+    cpu_rgba = load_image_to_rgba_cpu(ctx, path, &aw, &ah);
+    if (!cpu_rgba) return AVERROR(EINVAL);
+
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch(
+        &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4));
+    if (ret < 0) { av_free(cpu_rgba); return ret; }
+
+    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+    cpy.srcHost       = cpu_rgba;
+    cpy.srcPitch      = (size_t)aw * 4;
+    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    cpy.dstDevice     = dev_ptr;
+    cpy.dstPitch      = dev_pitch;
+    cpy.WidthInBytes  = (size_t)aw * 4;
+    cpy.Height        = (size_t)ah;
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy));
+    av_free(cpu_rgba);
+    if (ret < 0) {
+        s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr);
+        return ret;
+    }
+
+    if (!a) {
+        a = &s->icon_atlases[s->nb_icon_atlases++];
+        memset(a, 0, sizeof(*a));
+        av_strlcpy(a->icon_name, icon_name, sizeof(a->icon_name));
+    }
+    a->device_ptr   = dev_ptr;
+    a->device_pitch = dev_pitch;
+    a->w = aw;
+    a->h = ah;
+    *out = a;
+    av_log(ctx, AV_LOG_INFO, "icon '%s' loaded %s (%dx%d)\n", icon_name, path, aw, ah);
+    return 0;
+}
+
+static int render_overlay_icon(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    IconAtlas *a = NULL;
+    int px, py;
+    int base_x, base_y, base_w, base_h;
+    int aw, ah, ap, ealpha;
+    CUdeviceptr dst_y, dst_uv;
+    int ret;
+
+    if (!ov->u.icon.icon_name[0]) return 0;
+    ret = ensure_icon_atlas(ctx, ov->u.icon.icon_name, &a);
+    if (ret < 0 || !a) return 0;  /* non-fatal — skip */
+
+    if (ov->cell < 0) {
+        base_x = 0; base_y = 0;
+        base_w = s->out_width; base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells) {
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else return 0;
+
+    px = base_x + (int)(ov->x * base_w);
+    py = base_y + (int)(ov->y * base_h);
+    px &= ~1; py &= ~1;
+    if (px >= s->out_width || py >= s->out_height) return 0;
+
+    aw = a->w; ah = a->h;
+    if (px + aw > s->out_width)  aw = s->out_width  - px;
+    if (py + ah > s->out_height) ah = s->out_height - py;
+    aw &= ~1; ah &= ~1;
+    if (aw <= 0 || ah <= 0) return 0;
+
+    ap = (int)a->device_pitch;
+    ealpha = ov->opacity;
+
+    dst_y = (CUdeviceptr)out->data[0];
+    {
+        int dst_pitch_y = out->linesize[0];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_y,
+            DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    dst_uv = (CUdeviceptr)out->data[1];
+    {
+        int dst_pitch_uv = out->linesize[1];
+        CUdeviceptr atl = a->device_ptr;
+        void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
+            s->cu_func_alpha_blit_rgba_uv,
+            DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1,
+            BLOCKX, BLOCKY, 1,
+            0, s->cu_stream, args, NULL));
+        if (ret < 0) return ret;
+    }
+    return 0;
+}
+
+static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    int px, py, pw, ph;
+    uint8_t Y, U, V;
+    int ret;
+
+    ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph);
+    if (ret != 0) return ret < 0 ? ret : 0;
+
+    rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V);
+
+    if (ov->u.rect.thickness <= 0) {
+        /* Filled rect */
+        return render_strip_alpha(ctx, out, px, py, pw, ph, Y, U, V, ov->opacity);
+    } else {
+        int t = ov->u.rect.thickness;
+        t = FFMIN(t, FFMIN(pw / 2, ph / 2));
+        if (t < 2) t = 2;
+        t &= ~1;
+        /* Top */
+        ret = render_strip_alpha(ctx, out, px, py, pw, t, Y, U, V, ov->opacity);
+        if (ret < 0) return ret;
+        /* Bottom */
+        ret = render_strip_alpha(ctx, out, px, py + ph - t, pw, t, Y, U, V, ov->opacity);
+        if (ret < 0) return ret;
+        /* Left */
+        ret = render_strip_alpha(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V, ov->opacity);
+        if (ret < 0) return ret;
+        /* Right */
+        ret = render_strip_alpha(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V, ov->opacity);
+        return ret;
+    }
+}
+
+/* Dim overlay: рамка тёмная (Y=16, neutral UV) полупрозрачная по cell. */
+static int render_overlay_dim(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    int px, py, pw, ph;
+    int ret;
+
+    ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph);
+    if (ret != 0) return ret < 0 ? ret : 0;
+
+    /* Y=16 = darkest legal Y (BT.709 limited range), UV=128 = neutral chroma */
+    return render_strip_alpha(ctx, out, px, py, pw, ph, 16, 128, 128, ov->u.dim.amount);
+}
+
+/* Render all visible overlays. cu ctx must be pushed by caller. */
+static int render_overlays(AVFilterContext *ctx, AVFrame *out)
+{
+    CudaGridContext *s = ctx->priv;
+    GridOverlay sorted[MAX_OVERLAYS];
+    int i, j, n, ret;
+
+    pthread_mutex_lock(&s->overlay_lock);
+    n = s->nb_overlays;
+    memcpy(sorted, s->overlays, (size_t)n * sizeof(GridOverlay));
+    pthread_mutex_unlock(&s->overlay_lock);
+
+    /* Insertion sort by z_order (stable). n ≤ 64 → fine. */
+    for (i = 1; i < n; i++) {
+        GridOverlay tmp = sorted[i];
+        for (j = i; j > 0 && sorted[j - 1].z_order > tmp.z_order; j--)
+            sorted[j] = sorted[j - 1];
+        sorted[j] = tmp;
+    }
+
+    for (i = 0; i < n; i++) {
+        const GridOverlay *ov = &sorted[i];
+        if (!ov->visible) continue;
+        switch (ov->type) {
+        case OV_TYPE_RECT:
+            ret = render_overlay_rect(ctx, out, ov);
+            if (ret < 0) return ret;
+            break;
+        case OV_TYPE_DIM:
+            ret = render_overlay_dim(ctx, out, ov);
+            if (ret < 0) return ret;
+            break;
+        case OV_TYPE_TEXT:
+#if CONFIG_LIBFREETYPE
+            ret = render_overlay_text(ctx, out, ov);
+            if (ret < 0) return ret;
+#else
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: text — libfreetype disabled at build\n", ov->id);
+#endif
+            break;
+        case OV_TYPE_ICON:
+            ret = render_overlay_icon(ctx, out, ov);
+            if (ret < 0) return ret;
+            break;
+        }
+    }
+    return 0;
+}
+
+/* ─── Framesync callback ──────────────────────────────────────────────── */
+
+static int cuda_grid_compose(FFFrameSync *fs)
+{
+    AVFilterContext *ctx     = fs->parent;
+    AVFilterLink    *outlink = ctx->outputs[0];
+    CudaGridContext *s       = ctx->priv;
+    AVFrame *out = NULL;
+    AVFrame *in[MAX_CELLS] = {0};
+    CUcontext dummy;
+    int i, ret;
+    int nb = s->layout->nb_cells;
+
+    for (i = 0; i < nb; i++) {
+        ret = ff_framesync_get_frame(fs, i, &in[i], 0);
+        if (ret < 0)
+            return ret;
+        if (!in[i]) {
+            av_log(ctx, AV_LOG_WARNING, "input %d not ready, skipping frame\n", i);
+            return 0;
+        }
+    }
+
+    out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
+    if (!out)
+        return AVERROR(ENOMEM);
+
+    ret = av_frame_copy_props(out, in[0]);
+    if (ret < 0)
+        goto fail;
+    out->width  = s->out_width;
+    out->height = s->out_height;
+
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+    if (ret < 0)
+        goto fail;
+
+    for (i = 0; i < nb; i++) {
+        AVFrame *src = in[i];
+        int cx = s->cell_px[i].x;
+        int cy = s->cell_px[i].y;
+        int cw = s->cell_px[i].w;
+        int ch = s->cell_px[i].h;
+
+        if (src->width != cw || src->height != ch) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "input %d size %dx%d != cell size %dx%d. "
+                   "cuda_grid не делает scaling — используй upstream scale_npp:\n"
+                   "  [in%d]scale_npp=%d:%d[scaled%d]; [scaled%d]...cuda_grid=layout=%s\n",
+                   i, src->width, src->height, cw, ch,
+                   i, cw, ch, i, i, s->layout->name);
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }
+
+        /* Y plane (1 byte per pixel) */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[0], src->linesize[0],
+                               src->width, src->height,
+                               (CUdeviceptr)out->data[0], out->linesize[0],
+                               cx, cy, 1);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
+        }
+
+        /* UV plane (NV12: half resolution, 2 bytes per "pixel") */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[1], src->linesize[1],
+                               src->width / 2, src->height / 2,
+                               (CUdeviceptr)out->data[1], out->linesize[1],
+                               cx / 2, cy / 2, 2);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
+        }
+    }
+
+    /* Overlay pass (Phase 4b) */
+    ret = render_overlays(ctx, out);
+    if (ret < 0) {
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        goto fail;
+    }
+
+    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&out);
+    return ret;
+}
+
+/* ─── Lifecycle ────────────────────────────────────────────────────────── */
+
+static av_cold int cuda_grid_init(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    const LayoutTemplate *lt;
+    int i, ret;
+
+    lt = find_layout(s->layout_name);
+    if (!lt) {
+        av_log(ctx, AV_LOG_ERROR, "unknown layout '%s'. Доступны: ", s->layout_name);
+        for (i = 0; i < (int)FF_ARRAY_ELEMS(layouts); i++)
+            av_log(ctx, AV_LOG_ERROR, "%s ", layouts[i].name);
+        av_log(ctx, AV_LOG_ERROR, "\n");
+        return AVERROR(EINVAL);
+    }
+    s->layout = lt;
+
+    /* Dynamic inputs — append pad per cell */
+    for (i = 0; i < lt->nb_cells; i++) {
+        AVFilterPad pad = { 0 };
+        pad.type = AVMEDIA_TYPE_VIDEO;
+        pad.name = av_asprintf("input%d", i);
+        if (!pad.name)
+            return AVERROR(ENOMEM);
+        ret = ff_append_inpad_free_name(ctx, &pad);
+        if (ret < 0)
+            return ret;
+    }
+
+    /* Overlay mutex */
+    ret = pthread_mutex_init(&s->overlay_lock, NULL);
+    if (ret) {
+        av_log(ctx, AV_LOG_ERROR, "overlay_lock init failed: %d\n", ret);
+        return AVERROR(ENOMEM);
+    }
+    s->overlay_lock_inited = 1;
+
+    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d (overlays: rect)\n",
+           lt->name, lt->nb_cells, s->out_width, s->out_height);
+    return 0;
+}
+
+static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    int i;
+    ff_framesync_uninit(&s->fs);
+
+    if (s->hwctx) {
+        CUcontext dummy;
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+        for (i = 0; i < s->nb_atlases; i++) {
+            if (s->atlases[i].device_ptr) {
+                s->hwctx->internal->cuda_dl->cuMemFree(s->atlases[i].device_ptr);
+                s->atlases[i].device_ptr = 0;
+            }
+        }
+        s->nb_atlases = 0;
+        for (i = 0; i < s->nb_icon_atlases; i++) {
+            if (s->icon_atlases[i].device_ptr) {
+                s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr);
+                s->icon_atlases[i].device_ptr = 0;
+            }
+        }
+        s->nb_icon_atlases = 0;
+        if (s->cu_module) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
+            s->cu_module = NULL;
+        }
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+    }
+#if CONFIG_LIBFREETYPE
+    if (s->ft_face) { FT_Done_Face(s->ft_face); s->ft_face = NULL; }
+    if (s->ft_lib)  { FT_Done_FreeType(s->ft_lib); s->ft_lib = NULL; }
+#endif
+    if (s->overlay_lock_inited) {
+        pthread_mutex_destroy(&s->overlay_lock);
+        s->overlay_lock_inited = 0;
+    }
+}
+
+static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
+                                     const char *arg, char *res, int res_len, int flags)
+{
+    CudaGridContext *s = ctx->priv;
+    int ret;
+
+    if (!strcmp(cmd, "add_overlay")) {
+        GridOverlay ov;
+        ret = parse_overlay_args(ctx, arg, &ov);
+        if (ret < 0) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return ret;
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        ret = overlay_upsert_locked(s, &ov);
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s n=%d" : "err full id=%s",
+                          ov.id, s->nb_overlays);
+        return ret;
+    }
+
+    if (!strcmp(cmd, "remove_overlay")) {
+        char id[OVERLAY_ID_MAX];
+        if (!arg || sscanf(arg, "%31s", id) != 1) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return AVERROR(EINVAL);
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        ret = overlay_remove_locked(s, id);
+        /* Best-effort atlas GPU cleanup: cu ctx may not be pushed here, push it. */
+        if (ret == 0 && s->hwctx) {
+            CUcontext dummy;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+            atlas_remove_locked(s, id);
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        }
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id);
+        return ret;
+    }
+
+    if (!strcmp(cmd, "clear_overlays")) {
+        int i;
+        pthread_mutex_lock(&s->overlay_lock);
+        s->nb_overlays = 0;
+        if (s->hwctx) {
+            CUcontext dummy;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+            for (i = 0; i < s->nb_atlases; i++)
+                atlas_free_gpu_locked(s, &s->atlases[i]);
+            s->nb_atlases = 0;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        }
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) av_strlcpy(res, "ok", res_len);
+        return 0;
+    }
+
+    if (!strcmp(cmd, "reload_icon")) {
+        /* Invalidate cached icon atlas by name — next render re-reads file from disk.
+         * Used Phase 6 by controller для periodic chart/chat re-rendering. */
+        char name[OVERLAY_ICON_NAME_MAX];
+        int i, found = -1;
+        if (!arg || sscanf(arg, "%31s", name) != 1) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return AVERROR(EINVAL);
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        if (s->hwctx) {
+            CUcontext dummy;
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+            for (i = 0; i < s->nb_icon_atlases; i++) {
+                if (!strcmp(s->icon_atlases[i].icon_name, name)) {
+                    if (s->icon_atlases[i].device_ptr) {
+                        s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr);
+                        s->icon_atlases[i].device_ptr = 0;
+                    }
+                    memmove(&s->icon_atlases[i], &s->icon_atlases[i + 1],
+                            (s->nb_icon_atlases - i - 1) * sizeof(IconAtlas));
+                    s->nb_icon_atlases--;
+                    found = i;
+                    break;
+                }
+            }
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        }
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, found >= 0 ? "ok %s reloaded" : "not_cached %s", name);
+        return 0;
+    }
+
+    /* Future: set_layout (Phase 3 на filter side — пока nb_inputs хардкодится в init) */
+
+    /* Fall back to standard option/command handling */
+    return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags);
+}
+
+static int cuda_grid_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext     *ctx  = outlink->src;
+    CudaGridContext     *s    = ctx->priv;
+    AVFilterLink        *in0  = ctx->inputs[0];
+    FilterLink          *inl0 = ff_filter_link(in0);
+    FilterLink          *outl = ff_filter_link(outlink);
+    AVHWFramesContext   *hwfc0;
+    AVHWDeviceContext   *hwdev;
+    AVBufferRef         *out_ref;
+    AVHWFramesContext   *out_hwfc;
+    int i, ret;
+
+    if (!inl0->hw_frames_ctx)
+        return AVERROR(EINVAL);
+    hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data;
+
+    if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Phase 1-2a поддерживают только NV12, got %s\n",
+               av_get_pix_fmt_name(hwfc0->sw_format));
+        return AVERROR(EINVAL);
+    }
+
+    /* Compute pixel rects из normalized layout × output size */
+    for (i = 0; i < s->layout->nb_cells; i++) {
+        s->cell_px[i].x = (int)(s->layout->cells[i].x * s->out_width);
+        s->cell_px[i].y = (int)(s->layout->cells[i].y * s->out_height);
+        s->cell_px[i].w = (int)(s->layout->cells[i].w * s->out_width);
+        s->cell_px[i].h = (int)(s->layout->cells[i].h * s->out_height);
+        /* Align до chroma boundary (NV12 → 2x2) */
+        s->cell_px[i].x &= ~1;
+        s->cell_px[i].y &= ~1;
+        s->cell_px[i].w &= ~1;
+        s->cell_px[i].h &= ~1;
+        av_log(ctx, AV_LOG_VERBOSE, "  cell[%d] = %dx%d @ (%d,%d)\n",
+               i, s->cell_px[i].w, s->cell_px[i].h, s->cell_px[i].x, s->cell_px[i].y);
+    }
+
+    /* Validate все inputs: device match + sw_format match */
+    for (i = 1; i < s->layout->nb_cells; i++) {
+        AVFilterLink *inN = ctx->inputs[i];
+        FilterLink   *ilN = ff_filter_link(inN);
+        AVHWFramesContext *hN;
+        if (!ilN->hw_frames_ctx)
+            return AVERROR(EINVAL);
+        hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data;
+        if (hN->device_ctx != hwfc0->device_ctx) {
+            av_log(ctx, AV_LOG_ERROR, "input %d device mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+        if (hN->sw_format != hwfc0->sw_format) {
+            av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    outlink->w = s->out_width;
+    outlink->h = s->out_height;
+
+    hwdev = hwfc0->device_ctx;
+    s->hwctx    = (AVCUDADeviceContext *)hwdev->hwctx;
+    s->cu_ctx   = s->hwctx->cuda_ctx;
+    s->cu_stream = s->hwctx->stream;
+
+    /* Load CUDA module + resolve kernel handles (Phase 4b-2 alpha kernels) */
+    {
+        CUcontext dummy;
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
+        if (ret < 0) return ret;
+
+        ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                                  ff_vf_cuda_grid_ptx_data, ff_vf_cuda_grid_ptx_len);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            return ret;
+        }
+
+#define GET_FN(handle, name)                                                                \
+        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleGetFunction(                    \
+                       &s->handle, s->cu_module, name));                                    \
+        if (ret < 0) {                                                                       \
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));                 \
+            return ret;                                                                      \
+        }
+        GET_FN(cu_func_alpha_fill_y,        "Alpha_Fill_Y");
+        GET_FN(cu_func_alpha_fill_uv,       "Alpha_Fill_UV");
+        GET_FN(cu_func_alpha_blit_rgba_y,   "Alpha_Blit_RGBA_Y");
+        GET_FN(cu_func_alpha_blit_rgba_uv,  "Alpha_Blit_RGBA_UV");
+#undef GET_FN
+
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+    }
+
+    out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_hwfc = (AVHWFramesContext *)out_ref->data;
+    out_hwfc->format    = AV_PIX_FMT_CUDA;
+    out_hwfc->sw_format = AV_PIX_FMT_NV12;
+    out_hwfc->width     = s->out_width;
+    out_hwfc->height    = s->out_height;
+    out_hwfc->initial_pool_size = 4;
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0) {
+        av_buffer_unref(&out_ref);
+        return ret;
+    }
+    outl->hw_frames_ctx = out_ref;
+
+    ret = ff_framesync_init(&s->fs, ctx, s->layout->nb_cells);
+    if (ret < 0)
+        return ret;
+    {
+        FFFrameSyncIn *fs_in;
+        for (i = 0; i < s->layout->nb_cells; i++) {
+            fs_in = &s->fs.in[i];
+            fs_in->time_base = ctx->inputs[i]->time_base;
+            fs_in->sync   = 1;
+            fs_in->before = EXT_STOP;
+            fs_in->after  = EXT_INFINITY;
+        }
+    }
+    s->fs.opaque   = s;
+    s->fs.on_event = cuda_grid_compose;
+
+    outlink->time_base = ctx->inputs[0]->time_base;
+    return ff_framesync_configure(&s->fs);
+}
+
+static int cuda_grid_activate(AVFilterContext *ctx)
+{
+    CudaGridContext *s = ctx->priv;
+    return ff_framesync_activate(&s->fs);
+}
+
+/* ─── Options + registration ───────────────────────────────────────────── */
+
+#define OFFSET(x) offsetof(CudaGridContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+
+static const AVOption cuda_grid_options[] = {
+    { "layout", "имя layout template",
+      OFFSET(layout_name), AV_OPT_TYPE_STRING, { .str = "quad" }, 0, 0, FLAGS },
+    { "out_w", "ширина output frame в пикселях",
+      OFFSET(out_width),   AV_OPT_TYPE_INT,    { .i64 = 1920 }, 16, 16384, FLAGS },
+    { "out_h", "высота output frame в пикселях",
+      OFFSET(out_height),  AV_OPT_TYPE_INT,    { .i64 = 1080 }, 16, 16384, FLAGS },
+    { "font_file", "TTF/OTF font path (default: search DejaVu/Liberation)",
+      OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
+    { "font_size", "default text overlay size (px), используется если overlay не указал свой",
+      OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS },
+    { "icon_dir", "directory для resolve icon_name (по умолчанию /var/lib/cuda-grid/icons)",
+      OFFSET(icon_dir), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
+    { NULL }
+};
+
+static const AVClass cuda_grid_class = {
+    .class_name = "cuda_grid",
+    .item_name  = av_default_item_name,
+    .option     = cuda_grid_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_FILTER,
+};
+
+static const AVFilterPad cuda_grid_outputs[] = {
+    { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output },
+};
+
+const AVFilter ff_vf_cuda_grid = {
+    .name            = "cuda_grid",
+    .description     = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
+    .priv_class      = &cuda_grid_class,
+    .priv_size       = sizeof(CudaGridContext),
+    .init            = cuda_grid_init,
+    .uninit          = cuda_grid_uninit,
+    .activate        = cuda_grid_activate,
+    .process_command = cuda_grid_process_command,
+    /* No FILTER_INPUTS — pads added dynamically в init() per layout. */
+    FILTER_OUTPUTS(cuda_grid_outputs),
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+    .flags           = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vf_cuda_grid.cu b/libavfilter/vf_cuda_grid.cu
new file mode 100644
index 0000000..10d2af6
--- /dev/null
+++ b/libavfilter/vf_cuda_grid.cu
@@ -0,0 +1,121 @@
+/*
+ * cuda_grid overlay CUDA kernels.
+ *
+ * Алфа-блендинг poверх NV12 frame:
+ *   - Alpha_Fill_Y / Alpha_Fill_UV: solid colour fill region (rect/dim)
+ *   - Alpha_Blit_RGBA_Y / Alpha_Blit_RGBA_UV: blit RGBA atlas → NV12
+ *     (для text Phase 4b-3 и icon Phase 4b-4)
+ *
+ * BT.709 limited-range conversion (HDTV). См. также vf_cuda_grid.c rgb_to_yuv709.
+ *
+ * Лицензия: LGPL-2.1+ (соответствует FFmpeg)
+ */
+
+#include "cuda/vector_helpers.cuh"
+
+extern "C" {
+
+/* Solid colour α-blend на Y plane.
+ *   dst: pointer на Y plane base
+ *   dst_pitch: bytes per row
+ *   rx, ry, rw, rh: region rect (pixels, must be in-bounds)
+ *   fill: 0..255 (Y component to blend in)
+ *   alpha: 0..255 (255 = fully opaque)
+ */
+__global__ void Alpha_Fill_Y(unsigned char *dst, int dst_pitch,
+                             int rx, int ry, int rw, int rh,
+                             int fill, int alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= rw || y >= rh) return;
+    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x);
+    int cur = *p;
+    *p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255);
+}
+
+/* Solid colour α-blend на UV plane (NV12 interleaved).
+ *   rx, ry: in chroma plane coords (= full-res Y x/2, y/2)
+ *   rw, rh: also в chroma coords
+ */
+__global__ void Alpha_Fill_UV(unsigned char *dst, int dst_pitch,
+                              int rx, int ry, int rw, int rh,
+                              int fill_u, int fill_v, int alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= rw || y >= rh) return;
+    unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2;
+    int cu = p[0], cv = p[1];
+    p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255);
+    p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255);
+}
+
+/* Blit RGBA atlas → Y plane с α-blending.
+ *   dst, dst_pitch: Y plane
+ *   dx, dy: destination pixel offset (full-res coords)
+ *   atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch в bytes
+ *   w, h: atlas dimensions (in pixels)
+ *   extra_alpha: дополнительный множитель (0..255) — overlay-level opacity
+ */
+__global__ void Alpha_Blit_RGBA_Y(unsigned char *dst, int dst_pitch,
+                                  int dx, int dy,
+                                  const unsigned char *atlas, int atlas_pitch,
+                                  int w, int h, int extra_alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= w || y >= h) return;
+    const unsigned char *sp = atlas + y * atlas_pitch + x * 4;
+    int r = sp[0], g = sp[1], b = sp[2], a = sp[3];
+    a = a * extra_alpha / 255;
+    if (a == 0) return;
+    int Y = (int)(0.183f * r + 0.614f * g + 0.062f * b) + 16;
+    Y = Y < 0 ? 0 : (Y > 255 ? 255 : Y);
+    unsigned char *p = dst + (dy + y) * dst_pitch + (dx + x);
+    int cur = *p;
+    *p = (unsigned char)((Y * a + cur * (255 - a)) / 255);
+}
+
+/* Blit RGBA atlas → UV plane с 4:2:0 chroma subsampling.
+ *   dst, dst_pitch: UV plane
+ *   dx, dy: destination pixel offset (full-res coords; обе должны быть кратны 2)
+ *   atlas, atlas_pitch: RGBA source
+ *   w, h: atlas dimensions (full-res; UV operates на w/2 × h/2)
+ *   extra_alpha: 0..255
+ */
+__global__ void Alpha_Blit_RGBA_UV(unsigned char *dst, int dst_pitch,
+                                   int dx, int dy,
+                                   const unsigned char *atlas, int atlas_pitch,
+                                   int w, int h, int extra_alpha)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int hw = w / 2, hh = h / 2;
+    if (x >= hw || y >= hh) return;
+
+    int sx = x * 2, sy = y * 2;
+    const unsigned char *row0 = atlas + sy * atlas_pitch;
+    const unsigned char *row1 = atlas + (sy + 1) * atlas_pitch;
+
+    int r = (row0[sx*4+0] + row0[(sx+1)*4+0] + row1[sx*4+0] + row1[(sx+1)*4+0]) >> 2;
+    int g = (row0[sx*4+1] + row0[(sx+1)*4+1] + row1[sx*4+1] + row1[(sx+1)*4+1]) >> 2;
+    int b = (row0[sx*4+2] + row0[(sx+1)*4+2] + row1[sx*4+2] + row1[(sx+1)*4+2]) >> 2;
+    int a = (row0[sx*4+3] + row0[(sx+1)*4+3] + row1[sx*4+3] + row1[(sx+1)*4+3]) >> 2;
+    a = a * extra_alpha / 255;
+    if (a == 0) return;
+
+    int U = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128;
+    int V = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128;
+    U = U < 0 ? 0 : (U > 255 ? 255 : U);
+    V = V < 0 ? 0 : (V > 255 ? 255 : V);
+
+    int du = dx / 2 + x;
+    int dv = dy / 2 + y;
+    unsigned char *p = dst + dv * dst_pitch + du * 2;
+    int cu = p[0], cv = p[1];
+    p[0] = (unsigned char)((U * a + cu * (255 - a)) / 255);
+    p[1] = (unsigned char)((V * a + cv * (255 - a)) / 255);
+}
+
+}  /* extern "C" */