vf_cuda_grid: Phase 4b-2 — CUDA kernels (alpha-blended overlays)
Custom .cu kernels компилируются через ffbuild → .ptx, embedded в binary через bin2c pattern (как vf_scale_cuda). Loading через ff_cuda_load_module в config_output, kernel handles cached на life-of-filter. Kernels: Alpha_Fill_Y/UV — solid colour α-blend (rect, dim, border strips) Alpha_Blit_RGBA_Y/UV — blit RGBA atlas → NV12 (text/icon в 4b-3/4) Render side: render_strip_alpha — заменил render_strip_solid (alpha=255 ≡ solid) render_overlay_rect — opacity передаётся в kernel render_overlay_dim — Y=16/UV=neutral × dim.amount (затемнение region) Также: убран unused cuda_grid_config_input (-Werror), добавлены fields CUmodule + 4×CUfunction в Context; unload в uninit. cuMemsetD2D* НЕ доступны в FFmpeg's CudaFunctions wrapper, поэтому solid fill реализован через kernel — лишний overhead minimal (16×16 threads). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -410,7 +410,8 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o
|
||||
OBJS-$(CONFIG_OVERLAY_FILTER) += vf_overlay.o framesync.o
|
||||
OBJS-$(CONFIG_OVERLAY_CUDA_FILTER) += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
|
||||
cuda/load_helper.o
|
||||
OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o
|
||||
OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o \
|
||||
vf_cuda_grid.ptx.o cuda/load_helper.o
|
||||
OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \
|
||||
opencl/overlay.o framesync.o
|
||||
OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o
|
||||
|
||||
+107
-36
@@ -43,11 +43,19 @@
|
||||
#include "libavutil/thread.h"
|
||||
|
||||
#include "avfilter.h"
|
||||
#include "cuda/load_helper.h"
|
||||
#include "filters.h"
|
||||
#include "formats.h"
|
||||
#include "framesync.h"
|
||||
#include "video.h"
|
||||
|
||||
extern const unsigned char ff_vf_cuda_grid_ptx_data[];
|
||||
extern const unsigned int ff_vf_cuda_grid_ptx_len;
|
||||
|
||||
#define BLOCKX 16
|
||||
#define BLOCKY 16
|
||||
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
|
||||
#define MAX_CELLS 16
|
||||
#define MAX_OVERLAYS 64
|
||||
@@ -203,6 +211,13 @@ typedef struct CudaGridContext {
|
||||
int nb_overlays;
|
||||
pthread_mutex_t overlay_lock;
|
||||
int overlay_lock_inited;
|
||||
|
||||
/* CUDA kernels (Phase 4b-2) — loaded from embedded .ptx */
|
||||
CUmodule cu_module;
|
||||
CUfunction cu_func_alpha_fill_y;
|
||||
CUfunction cu_func_alpha_fill_uv;
|
||||
CUfunction cu_func_alpha_blit_rgba_y;
|
||||
CUfunction cu_func_alpha_blit_rgba_uv;
|
||||
} CudaGridContext;
|
||||
|
||||
/* ─── Composition: copy одного input plane в target region output ──────── */
|
||||
@@ -396,31 +411,49 @@ static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov,
|
||||
return (rw > 0 && rh > 0) ? 0 : 1; /* 1 → empty, skip */
|
||||
}
|
||||
|
||||
/* Solid filled strip (no alpha). cu ctx must be pushed. */
|
||||
static int render_strip_solid(AVFilterContext *ctx, AVFrame *out,
|
||||
/* Strip α-blend (opacity 0..255). cu ctx must be pushed.
|
||||
* opacity=255 → solid fill (kernel uses single-pass blend formula). */
|
||||
static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out,
|
||||
int x, int y, int w, int h,
|
||||
uint8_t Y, uint8_t U, uint8_t V)
|
||||
uint8_t Y, uint8_t U, uint8_t V, uint8_t alpha)
|
||||
{
|
||||
CudaGridContext *s = ctx->priv;
|
||||
CUdeviceptr dst_y, dst_uv;
|
||||
unsigned short uv;
|
||||
int ret;
|
||||
int iY = Y, iU = U, iV = V, iA = alpha;
|
||||
int rx, ry, rw, rh;
|
||||
|
||||
x &= ~1; y &= ~1; w &= ~1; h &= ~1;
|
||||
if (w <= 0 || h <= 0) return 0;
|
||||
if (w <= 0 || h <= 0 || alpha == 0) return 0;
|
||||
|
||||
dst_y = (CUdeviceptr)(out->data[0] + (size_t)y * out->linesize[0] + x);
|
||||
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async(
|
||||
dst_y, out->linesize[0], Y, (size_t)w, (size_t)h, s->cu_stream));
|
||||
if (ret < 0) return ret;
|
||||
/* Y plane */
|
||||
dst_y = (CUdeviceptr)out->data[0];
|
||||
rx = x; ry = y; rw = w; rh = h;
|
||||
{
|
||||
int dst_pitch_y = out->linesize[0];
|
||||
void *args[] = { &dst_y, &dst_pitch_y, &rx, &ry, &rw, &rh, &iY, &iA };
|
||||
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
|
||||
s->cu_func_alpha_fill_y,
|
||||
DIV_UP(w, BLOCKX), DIV_UP(h, BLOCKY), 1,
|
||||
BLOCKX, BLOCKY, 1,
|
||||
0, s->cu_stream, args, NULL));
|
||||
if (ret < 0) return ret;
|
||||
}
|
||||
|
||||
/* NV12 UV interleaved, half res. cuMemsetD2D16Async sets 16-bit element
|
||||
* = UV pair. Little-endian: low byte=U, high byte=V. */
|
||||
uv = (unsigned short)U | ((unsigned short)V << 8);
|
||||
dst_uv = (CUdeviceptr)(out->data[1] + (size_t)(y / 2) * out->linesize[1] + x);
|
||||
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async(
|
||||
dst_uv, out->linesize[1], uv, (size_t)(w / 2), (size_t)(h / 2), s->cu_stream));
|
||||
return ret;
|
||||
/* UV plane (half res, interleaved 2 bytes per chroma sample) */
|
||||
dst_uv = (CUdeviceptr)out->data[1];
|
||||
rx = x / 2; ry = y / 2; rw = w / 2; rh = h / 2;
|
||||
{
|
||||
int dst_pitch_uv = out->linesize[1];
|
||||
void *args[] = { &dst_uv, &dst_pitch_uv, &rx, &ry, &rw, &rh, &iU, &iV, &iA };
|
||||
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
|
||||
s->cu_func_alpha_fill_uv,
|
||||
DIV_UP(w / 2, BLOCKX), DIV_UP(h / 2, BLOCKY), 1,
|
||||
BLOCKX, BLOCKY, 1,
|
||||
0, s->cu_stream, args, NULL));
|
||||
if (ret < 0) return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
|
||||
@@ -436,28 +469,42 @@ static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOve
|
||||
rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V);
|
||||
|
||||
if (ov->u.rect.thickness <= 0) {
|
||||
/* Filled */
|
||||
return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V);
|
||||
/* Filled rect */
|
||||
return render_strip_alpha(ctx, out, px, py, pw, ph, Y, U, V, ov->opacity);
|
||||
} else {
|
||||
int t = ov->u.rect.thickness;
|
||||
t = FFMIN(t, FFMIN(pw / 2, ph / 2));
|
||||
if (t < 2) t = 2;
|
||||
t &= ~1;
|
||||
/* Top */
|
||||
ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V);
|
||||
ret = render_strip_alpha(ctx, out, px, py, pw, t, Y, U, V, ov->opacity);
|
||||
if (ret < 0) return ret;
|
||||
/* Bottom */
|
||||
ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V);
|
||||
ret = render_strip_alpha(ctx, out, px, py + ph - t, pw, t, Y, U, V, ov->opacity);
|
||||
if (ret < 0) return ret;
|
||||
/* Left */
|
||||
ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V);
|
||||
ret = render_strip_alpha(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V, ov->opacity);
|
||||
if (ret < 0) return ret;
|
||||
/* Right */
|
||||
ret = render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V);
|
||||
ret = render_strip_alpha(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V, ov->opacity);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* Dim overlay: рамка тёмная (Y=16, neutral UV) полупрозрачная по cell. */
|
||||
static int render_overlay_dim(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
|
||||
{
|
||||
CudaGridContext *s = ctx->priv;
|
||||
int px, py, pw, ph;
|
||||
int ret;
|
||||
|
||||
ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph);
|
||||
if (ret != 0) return ret < 0 ? ret : 0;
|
||||
|
||||
/* Y=16 = darkest legal Y (BT.709 limited range), UV=128 = neutral chroma */
|
||||
return render_strip_alpha(ctx, out, px, py, pw, ph, 16, 128, 128, ov->u.dim.amount);
|
||||
}
|
||||
|
||||
/* Render all visible overlays. cu ctx must be pushed by caller. */
|
||||
static int render_overlays(AVFilterContext *ctx, AVFrame *out)
|
||||
{
|
||||
@@ -487,7 +534,8 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out)
|
||||
if (ret < 0) return ret;
|
||||
break;
|
||||
case OV_TYPE_DIM:
|
||||
av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id);
|
||||
ret = render_overlay_dim(ctx, out, ov);
|
||||
if (ret < 0) return ret;
|
||||
break;
|
||||
case OV_TYPE_TEXT:
|
||||
av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id);
|
||||
@@ -641,6 +689,13 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
|
||||
{
|
||||
CudaGridContext *s = ctx->priv;
|
||||
ff_framesync_uninit(&s->fs);
|
||||
if (s->cu_module && s->hwctx) {
|
||||
CUcontext dummy;
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
s->cu_module = NULL;
|
||||
}
|
||||
if (s->overlay_lock_inited) {
|
||||
pthread_mutex_destroy(&s->overlay_lock);
|
||||
s->overlay_lock_inited = 0;
|
||||
@@ -695,19 +750,6 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
|
||||
return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags);
|
||||
}
|
||||
|
||||
static int cuda_grid_config_input(AVFilterLink *inlink)
|
||||
{
|
||||
AVFilterContext *ctx = inlink->src;
|
||||
FilterLink *inl = ff_filter_link(inlink);
|
||||
|
||||
if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
|
||||
av_log(ctx, AV_LOG_ERROR, "input %s: software pixel format не поддерживается\n",
|
||||
inlink->dstpad->name);
|
||||
return AVERROR(EINVAL);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cuda_grid_config_output(AVFilterLink *outlink)
|
||||
{
|
||||
AVFilterContext *ctx = outlink->src;
|
||||
@@ -773,6 +815,35 @@ static int cuda_grid_config_output(AVFilterLink *outlink)
|
||||
s->cu_ctx = s->hwctx->cuda_ctx;
|
||||
s->cu_stream = s->hwctx->stream;
|
||||
|
||||
/* Load CUDA module + resolve kernel handles (Phase 4b-2 alpha kernels) */
|
||||
{
|
||||
CUcontext dummy;
|
||||
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
|
||||
if (ret < 0) return ret;
|
||||
|
||||
ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
|
||||
ff_vf_cuda_grid_ptx_data, ff_vf_cuda_grid_ptx_len);
|
||||
if (ret < 0) {
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define GET_FN(handle, name) \
|
||||
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleGetFunction( \
|
||||
&s->handle, s->cu_module, name)); \
|
||||
if (ret < 0) { \
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); \
|
||||
return ret; \
|
||||
}
|
||||
GET_FN(cu_func_alpha_fill_y, "Alpha_Fill_Y");
|
||||
GET_FN(cu_func_alpha_fill_uv, "Alpha_Fill_UV");
|
||||
GET_FN(cu_func_alpha_blit_rgba_y, "Alpha_Blit_RGBA_Y");
|
||||
GET_FN(cu_func_alpha_blit_rgba_uv, "Alpha_Blit_RGBA_UV");
|
||||
#undef GET_FN
|
||||
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
}
|
||||
|
||||
out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
|
||||
if (!out_ref)
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
@@ -0,0 +1,121 @@
|
||||
/*
|
||||
* cuda_grid overlay CUDA kernels.
|
||||
*
|
||||
* Алфа-блендинг poверх NV12 frame:
|
||||
* - Alpha_Fill_Y / Alpha_Fill_UV: solid colour fill region (rect/dim)
|
||||
* - Alpha_Blit_RGBA_Y / Alpha_Blit_RGBA_UV: blit RGBA atlas → NV12
|
||||
* (для text Phase 4b-3 и icon Phase 4b-4)
|
||||
*
|
||||
* BT.709 limited-range conversion (HDTV). См. также vf_cuda_grid.c rgb_to_yuv709.
|
||||
*
|
||||
* Лицензия: LGPL-2.1+ (соответствует FFmpeg)
|
||||
*/
|
||||
|
||||
#include "cuda/vector_helpers.cuh"
|
||||
|
||||
extern "C" {
|
||||
|
||||
/* Solid colour α-blend на Y plane.
|
||||
* dst: pointer на Y plane base
|
||||
* dst_pitch: bytes per row
|
||||
* rx, ry, rw, rh: region rect (pixels, must be in-bounds)
|
||||
* fill: 0..255 (Y component to blend in)
|
||||
* alpha: 0..255 (255 = fully opaque)
|
||||
*/
|
||||
__global__ void Alpha_Fill_Y(unsigned char *dst, int dst_pitch,
|
||||
int rx, int ry, int rw, int rh,
|
||||
int fill, int alpha)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
if (x >= rw || y >= rh) return;
|
||||
unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x);
|
||||
int cur = *p;
|
||||
*p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255);
|
||||
}
|
||||
|
||||
/* Solid colour α-blend на UV plane (NV12 interleaved).
|
||||
* rx, ry: in chroma plane coords (= full-res Y x/2, y/2)
|
||||
* rw, rh: also в chroma coords
|
||||
*/
|
||||
__global__ void Alpha_Fill_UV(unsigned char *dst, int dst_pitch,
|
||||
int rx, int ry, int rw, int rh,
|
||||
int fill_u, int fill_v, int alpha)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
if (x >= rw || y >= rh) return;
|
||||
unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2;
|
||||
int cu = p[0], cv = p[1];
|
||||
p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255);
|
||||
p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255);
|
||||
}
|
||||
|
||||
/* Blit RGBA atlas → Y plane с α-blending.
|
||||
* dst, dst_pitch: Y plane
|
||||
* dx, dy: destination pixel offset (full-res coords)
|
||||
* atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch в bytes
|
||||
* w, h: atlas dimensions (in pixels)
|
||||
* extra_alpha: дополнительный множитель (0..255) — overlay-level opacity
|
||||
*/
|
||||
__global__ void Alpha_Blit_RGBA_Y(unsigned char *dst, int dst_pitch,
|
||||
int dx, int dy,
|
||||
const unsigned char *atlas, int atlas_pitch,
|
||||
int w, int h, int extra_alpha)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
if (x >= w || y >= h) return;
|
||||
const unsigned char *sp = atlas + y * atlas_pitch + x * 4;
|
||||
int r = sp[0], g = sp[1], b = sp[2], a = sp[3];
|
||||
a = a * extra_alpha / 255;
|
||||
if (a == 0) return;
|
||||
int Y = (int)(0.183f * r + 0.614f * g + 0.062f * b) + 16;
|
||||
Y = Y < 0 ? 0 : (Y > 255 ? 255 : Y);
|
||||
unsigned char *p = dst + (dy + y) * dst_pitch + (dx + x);
|
||||
int cur = *p;
|
||||
*p = (unsigned char)((Y * a + cur * (255 - a)) / 255);
|
||||
}
|
||||
|
||||
/* Blit RGBA atlas → UV plane с 4:2:0 chroma subsampling.
|
||||
* dst, dst_pitch: UV plane
|
||||
* dx, dy: destination pixel offset (full-res coords; обе должны быть кратны 2)
|
||||
* atlas, atlas_pitch: RGBA source
|
||||
* w, h: atlas dimensions (full-res; UV operates на w/2 × h/2)
|
||||
* extra_alpha: 0..255
|
||||
*/
|
||||
__global__ void Alpha_Blit_RGBA_UV(unsigned char *dst, int dst_pitch,
|
||||
int dx, int dy,
|
||||
const unsigned char *atlas, int atlas_pitch,
|
||||
int w, int h, int extra_alpha)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
int hw = w / 2, hh = h / 2;
|
||||
if (x >= hw || y >= hh) return;
|
||||
|
||||
int sx = x * 2, sy = y * 2;
|
||||
const unsigned char *row0 = atlas + sy * atlas_pitch;
|
||||
const unsigned char *row1 = atlas + (sy + 1) * atlas_pitch;
|
||||
|
||||
int r = (row0[sx*4+0] + row0[(sx+1)*4+0] + row1[sx*4+0] + row1[(sx+1)*4+0]) >> 2;
|
||||
int g = (row0[sx*4+1] + row0[(sx+1)*4+1] + row1[sx*4+1] + row1[(sx+1)*4+1]) >> 2;
|
||||
int b = (row0[sx*4+2] + row0[(sx+1)*4+2] + row1[sx*4+2] + row1[(sx+1)*4+2]) >> 2;
|
||||
int a = (row0[sx*4+3] + row0[(sx+1)*4+3] + row1[sx*4+3] + row1[(sx+1)*4+3]) >> 2;
|
||||
a = a * extra_alpha / 255;
|
||||
if (a == 0) return;
|
||||
|
||||
int U = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128;
|
||||
int V = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128;
|
||||
U = U < 0 ? 0 : (U > 255 ? 255 : U);
|
||||
V = V < 0 ? 0 : (V > 255 ? 255 : V);
|
||||
|
||||
int du = dx / 2 + x;
|
||||
int dv = dy / 2 + y;
|
||||
unsigned char *p = dst + dv * dst_pitch + du * 2;
|
||||
int cu = p[0], cv = p[1];
|
||||
p[0] = (unsigned char)((U * a + cu * (255 - a)) / 255);
|
||||
p[1] = (unsigned char)((V * a + cv * (255 - a)) / 255);
|
||||
}
|
||||
|
||||
} /* extern "C" */
|
||||
Reference in New Issue
Block a user