vf_cuda_grid: Phase 4b-1 — rect overlay primitives (solid fill, no alpha)

Добавляет внутреннее состояние overlay'ев (inner overlay state), защищённое mutex'ом, и обработчик process_command. Filled/border rect'ы рендерятся через cuMemsetD2D8Async/cuMemsetD2D16Async — без custom kernel'а (alpha blend — это Phase 4b-2, требует .cu). Commands: `add_overlay <id> rect cell=N x=.. y=.. w=.. h=.. r=.. g=.. b=.. thickness=.. opacity=..`; `remove_overlay <id>`; `clear_overlays`. Типы text/icon/dim определены, но их render заглушен до Phase 4b-2/3/4. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-19 22:17:41 +01:00
parent 178fc5bb4e
commit 9deaca7697
1 changed files with 391 additions and 11 deletions
@@ -40,6 +40,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/thread.h"

 #include "avfilter.h"
 #include "filters.h"
@@ -48,7 +49,46 @@
 #include "video.h"

 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
-#define MAX_CELLS 16
+#define MAX_CELLS    16
+#define MAX_OVERLAYS 64
+#define OVERLAY_ID_MAX 32
+#define OVERLAY_TEXT_MAX 128
+
+/* ─── Overlay primitives (Phase 4b-1: rect only, no alpha) ─────────────── */
+
+typedef enum {     /* kind of overlay primitive; decides which member of GridOverlay.u is used */
+    OV_TYPE_RECT = 0,   /* solid rectangle, filled or border-only */
+    OV_TYPE_TEXT,   /* text label; rendering planned for Phase 4b-3 */
+    OV_TYPE_ICON,   /* named icon; rendering planned for Phase 4b-4 */
+    OV_TYPE_DIM,    /* darkening region; rendering planned for Phase 4b-2 (alpha kernel) */
+} GridOverlayType;
+
+typedef struct GridOverlay {    /* one overlay primitive as stored in the filter's overlay table */
+    char            id[OVERLAY_ID_MAX];     /* NUL-terminated key; add/remove commands match on it */
+    GridOverlayType type;       /* selects the active member of the union u */
+    int             cell;       /* -1 = absolute on the output, otherwise relative to that cell */
+    float           x, y, w, h; /* nominally [0..1] normalized; relative to the cell or the output */
+    int             z_order;    /* smaller value → drawn first */
+    uint8_t         opacity;    /* 0..255; stored now, actually applied in Phase 4b-2 */
+    int             visible;    /* 0 = skipped by the renderer */
+    union {     /* per-type payload */
+        struct {
+            uint8_t r, g, b;    /* fill/border colour as 8-bit RGB */
+            int     thickness;  /* 0 = filled, >0 = border (border via 4 strips) */
+        } rect;
+        struct {
+            char    text[OVERLAY_TEXT_MAX];     /* NUL-terminated label text */
+            int     font_size;
+            uint8_t r, g, b;    /* text colour as 8-bit RGB */
+        } text;
+        struct {
+            char    icon_name[32];      /* NUL-terminated icon identifier */
+        } icon;
+        struct {
+            uint8_t amount; /* 0..255, alpha for dim */
+        } dim;
+    } u;
+} GridOverlay;

 /* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */

@@ -157,6 +197,12 @@ typedef struct CudaGridContext {
    struct {
        int x, y, w, h;
    } cell_px[MAX_CELLS];
+
+    /* Overlay state (Phase 4b) — mutex-guarded, process_command/render thread-safe */
+    GridOverlay     overlays[MAX_OVERLAYS];
+    int             nb_overlays;
+    pthread_mutex_t overlay_lock;
+    int             overlay_lock_inited;
 } CudaGridContext;

 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -188,6 +234,272 @@ static int copy_input_plane(AVFilterContext *ctx,
    return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
 }

+/* ─── Overlay parsing + rendering (Phase 4b-1: rect, solid fill, no alpha) ─ */
+
+/* Convert one 8-bit RGB colour to 8-bit BT.709 limited-range Y/U/V.
+ * Good enough for HD output (e.g. 1920×1080).
+ *
+ * r, g, b  full-range input components (0..255)
+ * Y, U, V  receive the converted components, clipped to 0..255
+ *
+ * The result is rounded to nearest. A plain (int) cast truncates toward zero,
+ * which loses up to one code value and pulls chroma toward 128. */
+static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b,
+                                            uint8_t *Y, uint8_t *U, uint8_t *V)
+{
+    /* The offset (16 or 128) plus 0.5 is added before the cast. Every sum is
+     * positive for 8-bit input, so truncation then equals round-to-nearest. */
+    int y = (int)( 0.183f * r + 0.614f * g + 0.062f * b +  16.5f);
+    int u = (int)(-0.101f * r - 0.339f * g + 0.439f * b + 128.5f);
+    int v = (int)( 0.439f * r - 0.399f * g - 0.040f * b + 128.5f);
+    *Y = av_clip_uint8(y);
+    *U = av_clip_uint8(u);
+    *V = av_clip_uint8(v);
+}
+
+/* Parse an add_overlay argument string into *ov.
+ *
+ * Format: "<id> <type> [<key>=<val> ...]", tokens separated by whitespace.
+ * <type> is one of rect, text, icon, dim. Values cannot contain whitespace
+ * because they are read with %s.
+ *
+ * ctx   logging context
+ * args  argument string; NULL is rejected
+ * ov    filled with defaults (cell=-1, opacity=255, visible=1, dim amount=128)
+ *       and then the parsed values; left untouched when <id>/<type> is missing
+ *
+ * Returns 0 on success, AVERROR(EINVAL) for NULL args, a missing <id>/<type>
+ * or an unknown type. A malformed key=value token ends parsing with a warning;
+ * an unknown key is skipped with a warning. Neither is an error. */
+static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov)
+{
+    char id[OVERLAY_ID_MAX], type_str[16];
+    const char *p;
+    int consumed = 0;
+
+    if (!args)
+        return AVERROR(EINVAL);
+
+    /* %n records where <type> ends. Searching for ' ' instead would go wrong
+     * on leading whitespace or tab separators, both of which %s accepts. */
+    if (sscanf(args, "%31s %15s%n", id, type_str, &consumed) != 2) {
+        av_log(ctx, AV_LOG_ERROR, "overlay args: expected '<id> <type> ...', got: %s\n", args);
+        return AVERROR(EINVAL);
+    }
+
+    memset(ov, 0, sizeof(*ov));
+    av_strlcpy(ov->id, id, sizeof(ov->id));
+
+    if      (!strcmp(type_str, "rect")) ov->type = OV_TYPE_RECT;
+    else if (!strcmp(type_str, "text")) ov->type = OV_TYPE_TEXT;
+    else if (!strcmp(type_str, "icon")) ov->type = OV_TYPE_ICON;
+    else if (!strcmp(type_str, "dim"))  ov->type = OV_TYPE_DIM;
+    else {
+        av_log(ctx, AV_LOG_ERROR, "unknown overlay type: %s\n", type_str);
+        return AVERROR(EINVAL);
+    }
+
+    /* Defaults */
+    ov->cell    = -1;
+    ov->opacity = 255;
+    ov->visible = 1;
+    if (ov->type == OV_TYPE_RECT)
+        ov->u.rect.thickness = 0;  /* filled */
+    if (ov->type == OV_TYPE_DIM)
+        ov->u.dim.amount = 128;
+
+    /* Continue right after <type> */
+    p = args + consumed;
+
+    for (;;) {
+        char key[32], val[OVERLAY_TEXT_MAX];
+        int n = 0;
+        int known = 1;
+
+        /* Skip separators, including a trailing CR/LF from the command source. */
+        while (*p == ' ' || *p == '\t' || *p == '\r' || *p == '\n') p++;
+        if (!*p)
+            break;
+
+        if (sscanf(p, "%31[^= \t\r\n]=%127s%n", key, val, &n) < 2) {
+            av_log(ctx, AV_LOG_WARNING, "overlay %s: ignoring malformed args: %s\n", ov->id, p);
+            break;
+        }
+        p += n;
+
+        /* Keys common to every type first, then the type-specific ones. */
+        if      (!strcmp(key, "cell"))    ov->cell    = atoi(val);
+        else if (!strcmp(key, "x"))       ov->x       = (float)atof(val);
+        else if (!strcmp(key, "y"))       ov->y       = (float)atof(val);
+        else if (!strcmp(key, "w"))       ov->w       = (float)atof(val);
+        else if (!strcmp(key, "h"))       ov->h       = (float)atof(val);
+        else if (!strcmp(key, "z_order") || !strcmp(key, "z"))
+                                          ov->z_order = atoi(val);
+        else if (!strcmp(key, "opacity")) ov->opacity = av_clip(atoi(val), 0, 255);
+        else if (!strcmp(key, "visible")) ov->visible = atoi(val) ? 1 : 0;
+        else if (ov->type == OV_TYPE_RECT) {
+            if      (!strcmp(key, "r"))         ov->u.rect.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.rect.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.rect.b = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val);
+            else                                known = 0;
+        } else if (ov->type == OV_TYPE_TEXT) {
+            if      (!strcmp(key, "text"))      av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text));
+            else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val);
+            else if (!strcmp(key, "r"))         ov->u.text.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.text.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.text.b = av_clip_uint8(atoi(val));
+            else                                known = 0;
+        } else if (ov->type == OV_TYPE_ICON) {
+            if (!strcmp(key, "icon_name")) av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name));
+            else                           known = 0;
+        } else if (ov->type == OV_TYPE_DIM) {
+            if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val));
+            else                        known = 0;
+        }
+
+        if (!known)
+            av_log(ctx, AV_LOG_WARNING, "overlay %s: unknown key '%s' ignored\n", ov->id, key);
+    }
+    return 0;
+}
+
+/* Store *ov in the overlay table: overwrite the entry with the same id, or
+ * append a new one. The caller must hold s->overlay_lock.
+ *
+ * Returns 0 on success, AVERROR(ENOSPC) when the id is new and the table
+ * already holds MAX_OVERLAYS entries. */
+static int overlay_upsert_locked(CudaGridContext *s, const GridOverlay *ov)
+{
+    GridOverlay *slot = s->overlays;
+    GridOverlay *const end = s->overlays + s->nb_overlays;
+
+    /* Linear search for an entry with a matching id. */
+    while (slot < end && strcmp(slot->id, ov->id))
+        slot++;
+
+    if (slot == end) {
+        /* Not found: slot now points at the first free element, if any. */
+        if (s->nb_overlays >= MAX_OVERLAYS)
+            return AVERROR(ENOSPC);
+        s->nb_overlays++;
+    }
+
+    *slot = *ov;
+    return 0;
+}
+
+/* Delete the overlay whose id equals `id`, closing the gap so the remaining
+ * entries keep their relative order. The caller must hold s->overlay_lock.
+ *
+ * Returns 0 on success, AVERROR(ENOENT) when no entry has that id. */
+static int overlay_remove_locked(CudaGridContext *s, const char *id)
+{
+    int idx = 0;
+
+    while (idx < s->nb_overlays && strcmp(s->overlays[idx].id, id))
+        idx++;
+
+    if (idx >= s->nb_overlays)
+        return AVERROR(ENOENT);
+
+    /* Shift the tail down by one element over the removed entry. */
+    s->nb_overlays--;
+    memmove(&s->overlays[idx], &s->overlays[idx + 1],
+            (s->nb_overlays - idx) * sizeof(GridOverlay));
+    return 0;
+}
+
+/* Scale a normalized fraction by an extent in pixels, truncating toward zero.
+ * Returns 0 when the product is NaN (*px is left untouched), 1 otherwise.
+ * The product is clamped to ±1e9 before the cast, so the conversion is always
+ * defined and adding base offsets or widths afterwards cannot overflow int. */
+static int overlay_frac_to_px(float frac, int extent, int *px)
+{
+    const float lim = 1.0e9f;
+    float v = frac * extent;
+
+    if (v != v)             /* NaN compares unequal to itself */
+        return 0;
+    if (v >  lim) v =  lim;
+    if (v < -lim) v = -lim;
+    *px = (int)v;
+    return 1;
+}
+
+/* Compute the pixel rectangle of an overlay, either relative to a layout cell
+ * (ov->cell >= 0) or to the whole output (ov->cell < 0).
+ * The rectangle is clipped against the output bounds and then rounded down to
+ * even x/y/w/h (2px alignment for NV12 chroma).
+ *
+ * s      filter context; reads out_width/out_height, layout and cell_px
+ * ov     overlay whose cell and x/y/w/h are used
+ * out_*  receive the rectangle whenever the return value is >= 0
+ *
+ * Returns 0 for a non-empty rectangle, 1 for an empty one or NaN coordinates
+ * (caller should skip), AVERROR(EINVAL) for a cell index outside the layout. */
+static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov,
+                              int *out_x, int *out_y, int *out_w, int *out_h)
+{
+    int rx, ry, rw, rh;
+    int dx, dy;
+    int base_x, base_y, base_w, base_h;
+
+    if (ov->cell < 0) {
+        base_x = 0;
+        base_y = 0;
+        base_w = s->out_width;
+        base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells && ov->cell < MAX_CELLS) {
+        /* The MAX_CELLS bound keeps the cell_px[] access in range in any case. */
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else {
+        return AVERROR(EINVAL);
+    }
+
+    /* x/y/w/h come from a user command and may be NaN, infinite or huge.
+     * Converting such a float to int directly is undefined behaviour. */
+    if (!overlay_frac_to_px(ov->x, base_w, &dx) ||
+        !overlay_frac_to_px(ov->y, base_h, &dy) ||
+        !overlay_frac_to_px(ov->w, base_w, &rw) ||
+        !overlay_frac_to_px(ov->h, base_h, &rh))
+        return 1;
+
+    rx = base_x + dx;
+    ry = base_y + dy;
+
+    /* Clip against the output frame. */
+    if (rx < 0)             { rw += rx; rx = 0; }
+    if (ry < 0)             { rh += ry; ry = 0; }
+    if (rx + rw > s->out_width)  rw = s->out_width  - rx;
+    if (ry + rh > s->out_height) rh = s->out_height - ry;
+    /* Round everything down to even values. */
+    rx &= ~1; ry &= ~1; rw &= ~1; rh &= ~1;
+
+    *out_x = rx; *out_y = ry; *out_w = rw; *out_h = rh;
+    return (rw > 0 && rh > 0) ? 0 : 1;  /* 1 → empty, skip */
+}
+
+/* Fill one rectangular strip of `out` with a solid Y/U/V colour (no alpha).
+ * The CUDA context must already be pushed by the caller; both memsets are
+ * queued on s->cu_stream.
+ *
+ * x, y, w, h  strip in luma pixels; each is rounded down to an even value,
+ *             and a strip that ends up empty is a successful no-op
+ * Y, U, V     colour to write
+ *
+ * Returns 0 on success or a negative error code from CHECK_CU. */
+static int render_strip_solid(AVFilterContext *ctx, AVFrame *out,
+                              int x, int y, int w, int h,
+                              uint8_t Y, uint8_t U, uint8_t V)
+{
+    CudaGridContext *s = ctx->priv;
+    const int ex = x & ~1, ey = y & ~1;
+    const int ew = w & ~1, eh = h & ~1;
+    CUdeviceptr luma_ptr, chroma_ptr;
+    unsigned short uv_pair;
+    int err;
+
+    if (ew <= 0 || eh <= 0)
+        return 0;
+
+    /* Luma plane: one byte per pixel. */
+    luma_ptr = (CUdeviceptr)(out->data[0] + (size_t)ey * out->linesize[0] + ex);
+    err = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async(
+        luma_ptr, out->linesize[0], Y, (size_t)ew, (size_t)eh, s->cu_stream));
+    if (err < 0)
+        return err;
+
+    /* Chroma plane: NV12 stores interleaved U,V at half resolution, so one
+     * 16-bit element is a UV pair (little-endian: low byte U, high byte V).
+     * The byte offset in a chroma row therefore equals the even luma x. */
+    uv_pair    = (unsigned short)(((unsigned short)V << 8) | (unsigned short)U);
+    chroma_ptr = (CUdeviceptr)(out->data[1] + (size_t)(ey / 2) * out->linesize[1] + ex);
+    err = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async(
+        chroma_ptr, out->linesize[1], uv_pair, (size_t)(ew / 2), (size_t)(eh / 2), s->cu_stream));
+    return err;
+}
+
+/* Draw one rect overlay into `out`, either filled (thickness <= 0) or as a
+ * border made of four solid strips. The CUDA context must be pushed by the
+ * caller.
+ *
+ * An overlay that cannot be placed (cell index outside the layout) or whose
+ * clipped rectangle is empty is skipped and reported as success. Overlays come
+ * from runtime commands, so one bad entry must not fail every composed frame.
+ *
+ * Returns 0 on success or skip, a negative error code if a fill fails. */
+static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    int px, py, pw, ph;
+    uint8_t Y, U, V;
+    int ret;
+
+    ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_DEBUG, "overlay %s: cell %d is not part of the layout, skipped\n",
+               ov->id, ov->cell);
+        return 0;
+    }
+    if (ret > 0)
+        return 0;   /* empty after clipping */
+
+    rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V);
+
+    if (ov->u.rect.thickness <= 0) {
+        /* Filled */
+        return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V);
+    } else {
+        /* Border: limit the thickness to half the rect, then force it to an
+         * even value of at least 2 so the chroma plane can represent it. */
+        int t = ov->u.rect.thickness;
+        t = FFMIN(t, FFMIN(pw / 2, ph / 2));
+        if (t < 2) t = 2;
+        t &= ~1;
+        /* Top */
+        ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V);
+        if (ret < 0) return ret;
+        /* Bottom */
+        ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V);
+        if (ret < 0) return ret;
+        /* Left: spans only the rows between top and bottom; may be empty */
+        ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V);
+        if (ret < 0) return ret;
+        /* Right */
+        ret = render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V);
+        return ret;
+    }
+}
+
+/* Draw every visible overlay onto `out` in ascending z_order.
+ * The CUDA context must be pushed by the caller.
+ *
+ * The overlay table is copied under overlay_lock and the lock is released
+ * before sorting and rendering. Only rect overlays are drawn; dim, text and
+ * icon entries are logged at trace level.
+ *
+ * Returns 0 on success or the first negative error from a rect render. */
+static int render_overlays(AVFilterContext *ctx, AVFrame *out)
+{
+    CudaGridContext *s = ctx->priv;
+    GridOverlay snapshot[MAX_OVERLAYS];
+    int count, pos;
+
+    pthread_mutex_lock(&s->overlay_lock);
+    count = s->nb_overlays;
+    memcpy(snapshot, s->overlays, (size_t)count * sizeof(GridOverlay));
+    pthread_mutex_unlock(&s->overlay_lock);
+
+    /* Stable insertion sort on z_order; at most MAX_OVERLAYS entries. */
+    for (pos = 1; pos < count; pos++) {
+        GridOverlay moving = snapshot[pos];
+        int hole = pos;
+
+        while (hole > 0 && snapshot[hole - 1].z_order > moving.z_order) {
+            snapshot[hole] = snapshot[hole - 1];
+            hole--;
+        }
+        snapshot[hole] = moving;
+    }
+
+    for (pos = 0; pos < count; pos++) {
+        const GridOverlay *ov = &snapshot[pos];
+
+        if (!ov->visible)
+            continue;
+
+        if (ov->type == OV_TYPE_RECT) {
+            int ret = render_overlay_rect(ctx, out, ov);
+            if (ret < 0)
+                return ret;
+        } else if (ov->type == OV_TYPE_DIM) {
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id);
+        } else if (ov->type == OV_TYPE_TEXT) {
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id);
+        } else if (ov->type == OV_TYPE_ICON) {
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id);
+        }
+    }
+    return 0;
+}
+
 /* ─── Framesync callback ──────────────────────────────────────────────── */

 static int cuda_grid_compose(FFFrameSync *fs)
@@ -267,6 +579,13 @@ static int cuda_grid_compose(FFFrameSync *fs)
        }
    }

+    /* Overlay pass (Phase 4b) */
+    ret = render_overlays(ctx, out);
+    if (ret < 0) {
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        goto fail;
+    }
+
    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
    return ff_filter_frame(outlink, out);

@@ -305,7 +624,15 @@ static av_cold int cuda_grid_init(AVFilterContext *ctx)
            return ret;
    }

-    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n",
+    /* Overlay mutex */
+    ret = pthread_mutex_init(&s->overlay_lock, NULL);
+    if (ret) {
+        av_log(ctx, AV_LOG_ERROR, "overlay_lock init failed: %d\n", ret);
+        return AVERROR(ENOMEM);
+    }
+    s->overlay_lock_inited = 1;
+
+    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d (overlays: rect)\n",
           lt->name, lt->nb_cells, s->out_width, s->out_height);
    return 0;
 }
@@ -314,6 +641,58 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
 {
    CudaGridContext *s = ctx->priv;
    ff_framesync_uninit(&s->fs);
+    if (s->overlay_lock_inited) {
+        pthread_mutex_destroy(&s->overlay_lock);
+        s->overlay_lock_inited = 0;
+    }
+}
+
+/* Runtime command handler for the overlay table.
+ *
+ *   add_overlay <id> <type> [key=val ...]   insert or replace an overlay
+ *   remove_overlay <id>                     delete an overlay
+ *   clear_overlays                          delete all overlays
+ *
+ * Any other command goes to ff_filter_process_command(). When `res` is
+ * non-NULL a short status string is written to it.
+ *
+ * Returns 0 on success; AVERROR(EINVAL) for unparsable arguments or a cell
+ * index outside the layout, AVERROR(ENOSPC) when the table is full,
+ * AVERROR(ENOENT) when the id to remove does not exist. */
+static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
+                                     const char *arg, char *res, int res_len, int flags)
+{
+    CudaGridContext *s = ctx->priv;
+    int ret;
+
+    if (!strcmp(cmd, "add_overlay")) {
+        GridOverlay ov;
+        int count;
+
+        ret = parse_overlay_args(ctx, arg, &ov);
+        if (ret < 0) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return ret;
+        }
+        /* Reject a cell the layout does not have, so the sender gets the
+         * error now rather than the overlay failing later at render time. */
+        if (s->layout && ov.cell >= s->layout->nb_cells) {
+            av_log(ctx, AV_LOG_ERROR, "overlay %s: cell %d out of range (layout has %d cells)\n",
+                   ov.id, ov.cell, s->layout->nb_cells);
+            if (res) snprintf(res, res_len, "err cell id=%s", ov.id);
+            return AVERROR(EINVAL);
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        ret   = overlay_upsert_locked(s, &ov);
+        count = s->nb_overlays;     /* read while still holding the lock */
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) {
+            if (ret == 0)
+                snprintf(res, res_len, "ok id=%s n=%d", ov.id, count);
+            else
+                snprintf(res, res_len, "err full id=%s", ov.id);
+        }
+        return ret;
+    }
+
+    if (!strcmp(cmd, "remove_overlay")) {
+        char id[OVERLAY_ID_MAX];
+        if (!arg || sscanf(arg, "%31s", id) != 1) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return AVERROR(EINVAL);
+        }
+        pthread_mutex_lock(&s->overlay_lock);
+        ret = overlay_remove_locked(s, id);
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id);
+        return ret;
+    }
+
+    if (!strcmp(cmd, "clear_overlays")) {
+        pthread_mutex_lock(&s->overlay_lock);
+        s->nb_overlays = 0;
+        pthread_mutex_unlock(&s->overlay_lock);
+        if (res) av_strlcpy(res, "ok", res_len);
+        return 0;
+    }
+
+    /* Future: set_layout (Phase 3 on the filter side; for now nb_inputs is hard-coded in init) */
+
+    /* Fall back to standard option/command handling */
+    return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags);
+}

 static int cuda_grid_config_input(AVFilterLink *inlink)
@@ -465,16 +844,17 @@ static const AVFilterPad cuda_grid_outputs[] = {
 };

 const AVFilter ff_vf_cuda_grid = {
-    .name           = "cuda_grid",
-    .description    = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
-    .priv_class     = &cuda_grid_class,
-    .priv_size      = sizeof(CudaGridContext),
-    .init           = cuda_grid_init,
-    .uninit         = cuda_grid_uninit,
-    .activate       = cuda_grid_activate,
+    .name            = "cuda_grid",
+    .description     = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
+    .priv_class      = &cuda_grid_class,
+    .priv_size       = sizeof(CudaGridContext),
+    .init            = cuda_grid_init,
+    .uninit          = cuda_grid_uninit,
+    .activate        = cuda_grid_activate,
+    .process_command = cuda_grid_process_command,  /* add_overlay / remove_overlay / clear_overlays */
    /* No FILTER_INPUTS — pads added dynamically in init() per layout. */
    FILTER_OUTPUTS(cuda_grid_outputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
-    .flags          = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
-    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+    .flags           = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
 };