From 9deaca769710c88d227316f615e241d73ec2f483 Mon Sep 17 00:00:00 2001
From: gx
Date: Tue, 19 May 2026 22:17:41 +0100
Subject: [PATCH] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=204b-1=20=E2=80=94=20?=
 =?UTF-8?q?rect=20overlay=20primitives=20(solid=20fill,=20no=20alpha)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Добавляет inner overlay state с mutex + process_command handler.
Rendering filled/border rects через cuMemsetD2D8Async/D2D16Async —
без custom kernel'а (Phase 4b-2 = alpha blend, требует .cu).

Commands:
  add_overlay <id> rect cell=N x=.. y=.. w=.. h=.. r=.. g=.. b=.. thickness=.. opacity=..
  remove_overlay <id>
  clear_overlays

text/icon/dim — типы определены, render заглушен до Phase 4b-2/3/4.

Co-Authored-By: Claude Opus 4.7
---
 libavfilter/vf_cuda_grid.c | 493 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 482 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index eca57f0..9f5cc51 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -40,6 +40,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/thread.h"
 
 #include "avfilter.h"
 #include "filters.h"
@@ -48,7 +49,47 @@
 #include "video.h"
 
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
-#define MAX_CELLS 16
+#define MAX_CELLS            16
+#define MAX_OVERLAYS         64
+#define OVERLAY_ID_MAX       32
+#define OVERLAY_TEXT_MAX     128
+#define OVERLAY_COORD_LIMIT  16.0f  /* sanity bound for normalized x/y/w/h */
+
+/* ─── Overlay primitives (Phase 4b-1: rect only, no alpha) ─────────────── */
+
+typedef enum {
+    OV_TYPE_RECT = 0,
+    OV_TYPE_TEXT,   /* Phase 4b-3 */
+    OV_TYPE_ICON,   /* Phase 4b-4 */
+    OV_TYPE_DIM,    /* Phase 4b-2 (alpha kernel) */
+} GridOverlayType;
+
+typedef struct GridOverlay {
+    char            id[OVERLAY_ID_MAX];
+    GridOverlayType type;
+    int             cell;       /* -1 = absolute on output, otherwise cell-relative */
+    float           x, y, w, h; /* normalized, nominally [0..1]; relative to cell or output */
+    int             z_order;    /* lower values are drawn first */
+    uint8_t         opacity;    /* 0..255; stored only, applied from Phase 4b-2 on */
+    int             visible;
+    union {
+        struct {
+            uint8_t r, g, b;
+            int     thickness;  /* 0 = filled, >0 = border (drawn as 4 strips) */
+        } rect;
+        struct {
+            char    text[OVERLAY_TEXT_MAX];
+            int     font_size;
+            uint8_t r, g, b;
+        } text;
+        struct {
+            char    icon_name[32];
+        } icon;
+        struct {
+            uint8_t amount;     /* 0..255, alpha for dim */
+        } dim;
+    } u;
+} GridOverlay;
 
 /* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */
 
@@ -157,6 +198,12 @@ typedef struct CudaGridContext {
     struct {
         int x, y, w, h;
     } cell_px[MAX_CELLS];
+
+    /* Overlay state (Phase 4b): guarded by overlay_lock, shared by process_command and render */
+    GridOverlay overlays[MAX_OVERLAYS];
+    int nb_overlays;
+    AVMutex overlay_lock;
+    int overlay_lock_inited;
 } CudaGridContext;
 
 /* ─── Composition: copy одного input plane в target region output ──────── */
@@ -188,6 +235,340 @@ static int copy_input_plane(AVFilterContext *ctx,
     return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream));
 }
 
+/* ─── Overlay parsing + rendering (Phase 4b-1: rect, solid fill, no alpha) ─ */
+
+/**
+ * Convert 8-bit RGB to limited-range BT.709 YUV.
+ *
+ * BT.709 is applied unconditionally, which suits HDTV output such as
+ * 1920x1080. The +0.5 folded into each offset rounds to nearest; every sum
+ * stays positive, so the truncating cast cannot round a negative chroma
+ * term toward zero.
+ */
+static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b,
+                                           uint8_t *Y, uint8_t *U, uint8_t *V)
+{
+    int y = (int)( 0.183f * r + 0.614f * g + 0.062f * b +  16.5f);
+    int u = (int)(-0.101f * r - 0.339f * g + 0.439f * b + 128.5f);
+    int v = (int)( 0.439f * r - 0.399f * g - 0.040f * b + 128.5f);
+
+    *Y = av_clip_uint8(y);
+    *U = av_clip_uint8(u);
+    *V = av_clip_uint8(v);
+}
+
+/* Parse one normalized coordinate. The value is clamped while still a double
+ * so that neither the narrowing to float nor the later float -> int pixel
+ * math can go out of range. */
+static float parse_overlay_coord(const char *val)
+{
+    return (float)av_clipd(atof(val), -OVERLAY_COORD_LIMIT, OVERLAY_COORD_LIMIT);
+}
+
+/**
+ * Parse "<id> <type> [<key>=<val> ...]" into *ov.
+ *
+ * *ov is zeroed, given its defaults and then overridden by the key=value
+ * pairs. Tokens are separated by whitespace, so a value cannot contain any.
+ * Keys that do not apply to the overlay type are ignored.
+ *
+ * @return 0 on success; AVERROR(EINVAL) if args is NULL, id or type is
+ *         missing, the type is unknown, or a token is not key=value.
+ */
+static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov)
+{
+    char id[OVERLAY_ID_MAX], type_str[16];
+    const char *p;
+    int consumed = 0;
+
+    if (!args)
+        return AVERROR(EINVAL);
+
+    /* %n records where the key=value list starts, so leading or repeated
+     * whitespace cannot desynchronise the scan position. */
+    if (sscanf(args, "%31s %15s%n", id, type_str, &consumed) != 2) {
+        av_log(ctx, AV_LOG_ERROR, "overlay args: expected '<id> <type> ...', got: %s\n", args);
+        return AVERROR(EINVAL);
+    }
+
+    memset(ov, 0, sizeof(*ov));
+    av_strlcpy(ov->id, id, sizeof(ov->id));
+
+    if      (!strcmp(type_str, "rect")) ov->type = OV_TYPE_RECT;
+    else if (!strcmp(type_str, "text")) ov->type = OV_TYPE_TEXT;
+    else if (!strcmp(type_str, "icon")) ov->type = OV_TYPE_ICON;
+    else if (!strcmp(type_str, "dim"))  ov->type = OV_TYPE_DIM;
+    else {
+        av_log(ctx, AV_LOG_ERROR, "unknown overlay type: %s\n", type_str);
+        return AVERROR(EINVAL);
+    }
+
+    /* Defaults */
+    ov->cell    = -1;
+    ov->opacity = 255;
+    ov->visible = 1;
+    if (ov->type == OV_TYPE_RECT)
+        ov->u.rect.thickness = 0; /* filled */
+    if (ov->type == OV_TYPE_DIM)
+        ov->u.dim.amount = 128;
+
+    p = args + consumed;
+    for (;;) {
+        char key[32], val[OVERLAY_TEXT_MAX];
+        int n = 0;
+
+        p += strspn(p, " \t\r\n");
+        if (!*p)
+            break;
+        if (sscanf(p, "%31[^= \t\r\n]=%127s%n", key, val, &n) != 2 || n <= 0) {
+            av_log(ctx, AV_LOG_ERROR, "overlay %s: malformed option near: %s\n", ov->id, p);
+            return AVERROR(EINVAL);
+        }
+        p += n;
+
+        if      (!strcmp(key, "cell"))    ov->cell = atoi(val);
+        else if (!strcmp(key, "x"))       ov->x = parse_overlay_coord(val);
+        else if (!strcmp(key, "y"))       ov->y = parse_overlay_coord(val);
+        else if (!strcmp(key, "w"))       ov->w = parse_overlay_coord(val);
+        else if (!strcmp(key, "h"))       ov->h = parse_overlay_coord(val);
+        else if (!strcmp(key, "z_order") || !strcmp(key, "z"))
+            ov->z_order = atoi(val);
+        else if (!strcmp(key, "opacity")) ov->opacity = av_clip(atoi(val), 0, 255);
+        else if (!strcmp(key, "visible")) ov->visible = atoi(val) ? 1 : 0;
+        else if (ov->type == OV_TYPE_RECT) {
+            if      (!strcmp(key, "r"))         ov->u.rect.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.rect.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.rect.b = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val);
+        } else if (ov->type == OV_TYPE_TEXT) {
+            if      (!strcmp(key, "text"))      av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text));
+            else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val);
+            else if (!strcmp(key, "r"))         ov->u.text.r = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "g"))         ov->u.text.g = av_clip_uint8(atoi(val));
+            else if (!strcmp(key, "b"))         ov->u.text.b = av_clip_uint8(atoi(val));
+        } else if (ov->type == OV_TYPE_ICON) {
+            if (!strcmp(key, "icon_name")) av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name));
+        } else if (ov->type == OV_TYPE_DIM) {
+            if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val));
+        }
+    }
+    return 0;
+}
+
+/**
+ * Insert *ov, or replace the stored overlay that has the same id.
+ * The caller must hold overlay_lock.
+ *
+ * @return 0 on success, AVERROR(ENOSPC) if MAX_OVERLAYS are already stored.
+ */
+static int overlay_upsert_locked(CudaGridContext *s, const GridOverlay *ov)
+{
+    int i;
+
+    for (i = 0; i < s->nb_overlays; i++) {
+        if (!strcmp(s->overlays[i].id, ov->id)) {
+            s->overlays[i] = *ov;
+            return 0;
+        }
+    }
+    if (s->nb_overlays >= MAX_OVERLAYS)
+        return AVERROR(ENOSPC);
+    s->overlays[s->nb_overlays++] = *ov;
+    return 0;
+}
+
+/**
+ * Remove the overlay called id, closing the gap it leaves in the array.
+ * The caller must hold overlay_lock.
+ *
+ * @return 0 on success, AVERROR(ENOENT) if no overlay has that id.
+ */
+static int overlay_remove_locked(CudaGridContext *s, const char *id)
+{
+    int i;
+
+    for (i = 0; i < s->nb_overlays; i++) {
+        if (!strcmp(s->overlays[i].id, id)) {
+            memmove(&s->overlays[i], &s->overlays[i + 1],
+                    (s->nb_overlays - i - 1) * sizeof(GridOverlay));
+            s->nb_overlays--;
+            return 0;
+        }
+    }
+    return AVERROR(ENOENT);
+}
+
+/**
+ * Compute the pixel rectangle of an overlay, either relative to its cell or
+ * absolute on the output (cell < 0).
+ *
+ * The rectangle is clipped to the output bounds and every value is rounded
+ * down to an even number for NV12 chroma alignment.
+ *
+ * @return 0 if the rectangle is non-empty, 1 if it is empty and should be
+ *         skipped, AVERROR(EINVAL) if the cell index is out of range.
+ */
+static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov,
+                              int *out_x, int *out_y, int *out_w, int *out_h)
+{
+    int rx, ry, rw, rh;
+    int base_x, base_y, base_w, base_h;
+
+    if (ov->cell < 0) {
+        base_x = 0;
+        base_y = 0;
+        base_w = s->out_width;
+        base_h = s->out_height;
+    } else if (ov->cell < s->layout->nb_cells) {
+        base_x = s->cell_px[ov->cell].x;
+        base_y = s->cell_px[ov->cell].y;
+        base_w = s->cell_px[ov->cell].w;
+        base_h = s->cell_px[ov->cell].h;
+    } else {
+        return AVERROR(EINVAL);
+    }
+
+    rx = base_x + (int)(ov->x * base_w);
+    ry = base_y + (int)(ov->y * base_h);
+    rw = (int)(ov->w * base_w);
+    rh = (int)(ov->h * base_h);
+
+    if (rx < 0) { rw += rx; rx = 0; }
+    if (ry < 0) { rh += ry; ry = 0; }
+    if (rx + rw > s->out_width)  rw = s->out_width  - rx;
+    if (ry + rh > s->out_height) rh = s->out_height - ry;
+    rx &= ~1; ry &= ~1; rw &= ~1; rh &= ~1;
+
+    *out_x = rx; *out_y = ry; *out_w = rw; *out_h = rh;
+    return (rw > 0 && rh > 0) ? 0 : 1; /* 1 → empty, skip */
+}
+
+/**
+ * Fill one rectangle of an NV12 frame with a solid colour (no alpha).
+ * The CUDA context must already be pushed by the caller.
+ *
+ * x, y, w and h are rounded down to even values; an empty rectangle is a
+ * successful no-op.
+ */
+static int render_strip_solid(AVFilterContext *ctx, AVFrame *out,
+                              int x, int y, int w, int h,
+                              uint8_t Y, uint8_t U, uint8_t V)
+{
+    CudaGridContext *s = ctx->priv;
+    CUdeviceptr dst_y, dst_uv;
+    unsigned short uv;
+    int ret;
+
+    x &= ~1; y &= ~1; w &= ~1; h &= ~1;
+    if (w <= 0 || h <= 0) return 0;
+
+    dst_y = (CUdeviceptr)(out->data[0] + (size_t)y * out->linesize[0] + x);
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async(
+        dst_y, out->linesize[0], Y, (size_t)w, (size_t)h, s->cu_stream));
+    if (ret < 0) return ret;
+
+    /* NV12 chroma is interleaved UV at half resolution. cuMemsetD2D16Async
+     * writes 16-bit elements, i.e. one UV pair each; little-endian layout
+     * puts U in the low byte and V in the high byte. */
+    uv = (unsigned short)U | ((unsigned short)V << 8);
+    dst_uv = (CUdeviceptr)(out->data[1] + (size_t)(y / 2) * out->linesize[1] + x);
+    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async(
+        dst_uv, out->linesize[1], uv, (size_t)(w / 2), (size_t)(h / 2), s->cu_stream));
+    return ret;
+}
+
+/**
+ * Draw one rect overlay, filled (thickness <= 0) or as a border made of
+ * four strips. An empty rectangle or an out-of-range cell is skipped, so
+ * that one bad overlay cannot fail every output frame.
+ */
+static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
+{
+    CudaGridContext *s = ctx->priv;
+    int px, py, pw, ph;
+    uint8_t Y, U, V;
+    int ret, t;
+
+    ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_DEBUG, "overlay %s: cell %d out of range, skipped\n",
+               ov->id, ov->cell);
+        return 0;
+    }
+    if (ret > 0)
+        return 0;
+
+    rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V);
+
+    if (ov->u.rect.thickness <= 0) /* filled */
+        return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V);
+
+    /* Border: cap at half the smaller side, then force at least 2 and even. */
+    t = FFMIN(ov->u.rect.thickness, FFMIN(pw / 2, ph / 2));
+    if (t < 2) t = 2;
+    t &= ~1;
+
+    /* Top */
+    ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V);
+    if (ret < 0) return ret;
+    /* Bottom */
+    ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V);
+    if (ret < 0) return ret;
+    /* Left */
+    ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V);
+    if (ret < 0) return ret;
+    /* Right */
+    return render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V);
+}
+
+/**
+ * Render all visible overlays in ascending z_order.
+ * The CUDA context must already be pushed by the caller.
+ */
+static int render_overlays(AVFilterContext *ctx, AVFrame *out)
+{
+    CudaGridContext *s = ctx->priv;
+    GridOverlay sorted[MAX_OVERLAYS];
+    int i, j, n, ret;
+
+    /* Work on a snapshot so the lock is not held while GPU work is queued. */
+    ff_mutex_lock(&s->overlay_lock);
+    n = s->nb_overlays;
+    memcpy(sorted, s->overlays, (size_t)n * sizeof(GridOverlay));
+    ff_mutex_unlock(&s->overlay_lock);
+
+    /* Insertion sort by z_order (stable); n <= MAX_OVERLAYS, so this is cheap. */
+    for (i = 1; i < n; i++) {
+        GridOverlay tmp = sorted[i];
+        for (j = i; j > 0 && sorted[j - 1].z_order > tmp.z_order; j--)
+            sorted[j] = sorted[j - 1];
+        sorted[j] = tmp;
+    }
+
+    for (i = 0; i < n; i++) {
+        const GridOverlay *ov = &sorted[i];
+        if (!ov->visible) continue;
+        switch (ov->type) {
+        case OV_TYPE_RECT:
+            ret = render_overlay_rect(ctx, out, ov);
+            if (ret < 0) return ret;
+            break;
+        case OV_TYPE_DIM:
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id);
+            break;
+        case OV_TYPE_TEXT:
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id);
+            break;
+        case OV_TYPE_ICON:
+            av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id);
+            break;
+        default:
+            break;
+        }
+    }
+    return 0;
+}
+
 /* ─── Framesync callback ──────────────────────────────────────────────── */
 
 static int cuda_grid_compose(FFFrameSync *fs)
@@ -267,6 +648,13 @@ static int cuda_grid_compose(FFFrameSync *fs)
         }
     }
 
+    /* Overlay pass (Phase 4b) */
+    ret = render_overlays(ctx, out);
+    if (ret < 0) {
+        CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+        goto fail;
+    }
+
     CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
 
     return ff_filter_frame(outlink, out);
@@ -305,7 +693,15 @@ static av_cold int cuda_grid_init(AVFilterContext *ctx)
         return ret;
     }
 
-    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d\n",
+    /* Overlay mutex; ff_mutex_init() reports a positive errno-style code */
+    ret = ff_mutex_init(&s->overlay_lock, NULL);
+    if (ret) {
+        av_log(ctx, AV_LOG_ERROR, "overlay_lock init failed: %d\n", ret);
+        return AVERROR(ret);
+    }
+    s->overlay_lock_inited = 1;
+
+    av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d (overlays: rect)\n",
            lt->name, lt->nb_cells, s->out_width, s->out_height);
     return 0;
 }
@@ -314,6 +710,80 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
 {
     CudaGridContext *s = ctx->priv;
     ff_framesync_uninit(&s->fs);
+    if (s->overlay_lock_inited) {
+        ff_mutex_destroy(&s->overlay_lock);
+        s->overlay_lock_inited = 0;
+    }
+}
+
+/**
+ * Runtime commands:
+ *   add_overlay <id> <type> [<key>=<val> ...]  add or replace an overlay
+ *   remove_overlay <id>                        remove one overlay
+ *   clear_overlays                             remove every overlay
+ * Any other command is passed to ff_filter_process_command().
+ */
+static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
+                                     const char *arg, char *res, int res_len, int flags)
+{
+    CudaGridContext *s = ctx->priv;
+    int ret;
+
+    if (!strcmp(cmd, "add_overlay")) {
+        GridOverlay ov;
+        int count;
+
+        ret = parse_overlay_args(ctx, arg, &ov);
+        if (ret < 0) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return ret;
+        }
+        /* Reject a bad cell now rather than at render time. */
+        if (s->layout && ov.cell >= s->layout->nb_cells) {
+            av_log(ctx, AV_LOG_ERROR, "overlay %s: cell %d out of range (layout has %d cells)\n",
+                   ov.id, ov.cell, s->layout->nb_cells);
+            if (res) snprintf(res, res_len, "err cell id=%s", ov.id);
+            return AVERROR(EINVAL);
+        }
+        ff_mutex_lock(&s->overlay_lock);
+        ret   = overlay_upsert_locked(s, &ov);
+        count = s->nb_overlays; /* read under the lock, not after unlocking */
+        ff_mutex_unlock(&s->overlay_lock);
+        if (res) {
+            if (ret == 0)
+                snprintf(res, res_len, "ok id=%s n=%d", ov.id, count);
+            else
+                snprintf(res, res_len, "err full id=%s", ov.id);
+        }
+        return ret;
+    }
+
+    if (!strcmp(cmd, "remove_overlay")) {
+        char id[OVERLAY_ID_MAX];
+
+        if (!arg || sscanf(arg, "%31s", id) != 1) {
+            if (res) av_strlcpy(res, "err parse", res_len);
+            return AVERROR(EINVAL);
+        }
+        ff_mutex_lock(&s->overlay_lock);
+        ret = overlay_remove_locked(s, id);
+        ff_mutex_unlock(&s->overlay_lock);
+        if (res) snprintf(res, res_len, "%s id=%s", ret == 0 ? "ok" : "not_found", id);
+        return ret;
+    }
+
+    if (!strcmp(cmd, "clear_overlays")) {
+        ff_mutex_lock(&s->overlay_lock);
+        s->nb_overlays = 0;
+        ff_mutex_unlock(&s->overlay_lock);
+        if (res) av_strlcpy(res, "ok", res_len);
+        return 0;
+    }
+
+    /* Future: set_layout (Phase 3 on the filter side; for now nb_inputs is fixed in init) */
+
+    /* Fall back to standard option/command handling */
+    return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags);
 }
 
 static int cuda_grid_config_input(AVFilterLink *inlink)
@@ -465,16 +935,17 @@ static const AVFilterPad cuda_grid_outputs[] = {
 };
 
 const AVFilter ff_vf_cuda_grid = {
-    .name           = "cuda_grid",
-    .description    = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
-    .priv_class     = &cuda_grid_class,
-    .priv_size      = sizeof(CudaGridContext),
-    .init           = cuda_grid_init,
-    .uninit         = cuda_grid_uninit,
-    .activate       = cuda_grid_activate,
+    .name            = "cuda_grid",
+    .description     = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."),
+    .priv_class      = &cuda_grid_class,
+    .priv_size       = sizeof(CudaGridContext),
+    .init            = cuda_grid_init,
+    .uninit          = cuda_grid_uninit,
+    .activate        = cuda_grid_activate,
+    .process_command = cuda_grid_process_command,
     /* No FILTER_INPUTS — pads added dynamically в init() per layout. */
     FILTER_OUTPUTS(cuda_grid_outputs),
     FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
-    .flags          = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
-    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+    .flags           = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
 };