diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 8015a49..f570821 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -410,7 +410,8 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o OBJS-$(CONFIG_OVERLAY_FILTER) += vf_overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_CUDA_FILTER) += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \ cuda/load_helper.o -OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o +OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o \ + vf_cuda_grid.ptx.o cuda/load_helper.o OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \ opencl/overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 9f5cc51..675fb15 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -43,11 +43,19 @@ #include "libavutil/thread.h" #include "avfilter.h" +#include "cuda/load_helper.h" #include "filters.h" #include "formats.h" #include "framesync.h" #include "video.h" +extern const unsigned char ff_vf_cuda_grid_ptx_data[]; +extern const unsigned int ff_vf_cuda_grid_ptx_len; + +#define BLOCKX 16 +#define BLOCKY 16 +#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) + #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) #define MAX_CELLS 16 #define MAX_OVERLAYS 64 @@ -203,6 +211,13 @@ typedef struct CudaGridContext { int nb_overlays; pthread_mutex_t overlay_lock; int overlay_lock_inited; + + /* CUDA kernels (Phase 4b-2) — loaded from embedded .ptx */ + CUmodule cu_module; + CUfunction cu_func_alpha_fill_y; + CUfunction cu_func_alpha_fill_uv; + CUfunction cu_func_alpha_blit_rgba_y; + CUfunction cu_func_alpha_blit_rgba_uv; } CudaGridContext; /* ─── Composition: copy одного input plane в target region output ──────── */ @@ -396,31 +411,49 @@ static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov, return (rw > 0 && rh > 0) ? 0 : 1; /* 1 → empty, skip */ } -/* Solid filled strip (no alpha). cu ctx must be pushed. */ -static int render_strip_solid(AVFilterContext *ctx, AVFrame *out, +/* Strip α-blend (opacity 0..255). cu ctx must be pushed. + * opacity=255 → solid fill (kernel uses single-pass blend formula). */ +static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out, int x, int y, int w, int h, - uint8_t Y, uint8_t U, uint8_t V) + uint8_t Y, uint8_t U, uint8_t V, uint8_t alpha) { CudaGridContext *s = ctx->priv; CUdeviceptr dst_y, dst_uv; - unsigned short uv; int ret; + int iY = Y, iU = U, iV = V, iA = alpha; + int rx, ry, rw, rh; x &= ~1; y &= ~1; w &= ~1; h &= ~1; - if (w <= 0 || h <= 0) return 0; + if (w <= 0 || h <= 0 || alpha == 0) return 0; - dst_y = (CUdeviceptr)(out->data[0] + (size_t)y * out->linesize[0] + x); - ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D8Async( - dst_y, out->linesize[0], Y, (size_t)w, (size_t)h, s->cu_stream)); - if (ret < 0) return ret; + /* Y plane */ + dst_y = (CUdeviceptr)out->data[0]; + rx = x; ry = y; rw = w; rh = h; + { + int dst_pitch_y = out->linesize[0]; + void *args[] = { &dst_y, &dst_pitch_y, &rx, &ry, &rw, &rh, &iY, &iA }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_fill_y, + DIV_UP(w, BLOCKX), DIV_UP(h, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } - /* NV12 UV interleaved, half res. cuMemsetD2D16Async sets 16-bit element - * = UV pair. Little-endian: low byte=U, high byte=V. */ - uv = (unsigned short)U | ((unsigned short)V << 8); - dst_uv = (CUdeviceptr)(out->data[1] + (size_t)(y / 2) * out->linesize[1] + x); - ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemsetD2D16Async( - dst_uv, out->linesize[1], uv, (size_t)(w / 2), (size_t)(h / 2), s->cu_stream)); - return ret; + /* UV plane (half res, interleaved 2 bytes per chroma sample) */ + dst_uv = (CUdeviceptr)out->data[1]; + rx = x / 2; ry = y / 2; rw = w / 2; rh = h / 2; + { + int dst_pitch_uv = out->linesize[1]; + void *args[] = { &dst_uv, &dst_pitch_uv, &rx, &ry, &rw, &rh, &iU, &iV, &iA }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_fill_uv, + DIV_UP(w / 2, BLOCKX), DIV_UP(h / 2, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + return 0; } static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) @@ -436,28 +469,42 @@ static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOve rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V); if (ov->u.rect.thickness <= 0) { - /* Filled */ - return render_strip_solid(ctx, out, px, py, pw, ph, Y, U, V); + /* Filled rect */ + return render_strip_alpha(ctx, out, px, py, pw, ph, Y, U, V, ov->opacity); } else { int t = ov->u.rect.thickness; t = FFMIN(t, FFMIN(pw / 2, ph / 2)); if (t < 2) t = 2; t &= ~1; /* Top */ - ret = render_strip_solid(ctx, out, px, py, pw, t, Y, U, V); + ret = render_strip_alpha(ctx, out, px, py, pw, t, Y, U, V, ov->opacity); if (ret < 0) return ret; /* Bottom */ - ret = render_strip_solid(ctx, out, px, py + ph - t, pw, t, Y, U, V); + ret = render_strip_alpha(ctx, out, px, py + ph - t, pw, t, Y, U, V, ov->opacity); if (ret < 0) return ret; /* Left */ - ret = render_strip_solid(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V); + ret = render_strip_alpha(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V, ov->opacity); if (ret < 0) return ret; /* Right */ - ret = render_strip_solid(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V); + ret = render_strip_alpha(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V, ov->opacity); return ret; } } +/* Dim overlay: рамка тёмная (Y=16, neutral UV) полупрозрачная по cell. */ +static int render_overlay_dim(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + int px, py, pw, ph; + int ret; + + ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph); + if (ret != 0) return ret < 0 ? ret : 0; + + /* Y=16 = darkest legal Y (BT.709 limited range), UV=128 = neutral chroma */ + return render_strip_alpha(ctx, out, px, py, pw, ph, 16, 128, 128, ov->u.dim.amount); +} + /* Render all visible overlays. cu ctx must be pushed by caller. */ static int render_overlays(AVFilterContext *ctx, AVFrame *out) { @@ -487,7 +534,8 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out) if (ret < 0) return ret; break; case OV_TYPE_DIM: - av_log(ctx, AV_LOG_TRACE, "overlay %s: dim type — Phase 4b-2 (alpha kernel)\n", ov->id); + ret = render_overlay_dim(ctx, out, ov); + if (ret < 0) return ret; break; case OV_TYPE_TEXT: av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id); @@ -641,6 +689,13 @@ static av_cold void cuda_grid_uninit(AVFilterContext *ctx) { CudaGridContext *s = ctx->priv; ff_framesync_uninit(&s->fs); + if (s->cu_module && s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + s->cu_module = NULL; + } if (s->overlay_lock_inited) { pthread_mutex_destroy(&s->overlay_lock); s->overlay_lock_inited = 0; @@ -695,19 +750,6 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd, return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags); } -static int cuda_grid_config_input(AVFilterLink *inlink) -{ - AVFilterContext *ctx = inlink->src; - FilterLink *inl = ff_filter_link(inlink); - - if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) { - av_log(ctx, AV_LOG_ERROR, "input %s: software pixel format не поддерживается\n", - inlink->dstpad->name); - return AVERROR(EINVAL); - } - return 0; -} - static int cuda_grid_config_output(AVFilterLink *outlink) { AVFilterContext *ctx = outlink->src; @@ -773,6 +815,35 @@ static int cuda_grid_config_output(AVFilterLink *outlink) s->cu_ctx = s->hwctx->cuda_ctx; s->cu_stream = s->hwctx->stream; + /* Load CUDA module + resolve kernel handles (Phase 4b-2 alpha kernels) */ + { + CUcontext dummy; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + if (ret < 0) return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_cuda_grid_ptx_data, ff_vf_cuda_grid_ptx_len); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + return ret; + } + +#define GET_FN(handle, name) \ + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleGetFunction( \ + &s->handle, s->cu_module, name)); \ + if (ret < 0) { \ + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); \ + return ret; \ + } + GET_FN(cu_func_alpha_fill_y, "Alpha_Fill_Y"); + GET_FN(cu_func_alpha_fill_uv, "Alpha_Fill_UV"); + GET_FN(cu_func_alpha_blit_rgba_y, "Alpha_Blit_RGBA_Y"); + GET_FN(cu_func_alpha_blit_rgba_uv, "Alpha_Blit_RGBA_UV"); +#undef GET_FN + + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } + out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref); if (!out_ref) return AVERROR(ENOMEM); diff --git a/libavfilter/vf_cuda_grid.cu b/libavfilter/vf_cuda_grid.cu new file mode 100644 index 0000000..10d2af6 --- /dev/null +++ b/libavfilter/vf_cuda_grid.cu @@ -0,0 +1,121 @@ +/* + * cuda_grid overlay CUDA kernels. + * + * Алфа-блендинг poверх NV12 frame: + * - Alpha_Fill_Y / Alpha_Fill_UV: solid colour fill region (rect/dim) + * - Alpha_Blit_RGBA_Y / Alpha_Blit_RGBA_UV: blit RGBA atlas → NV12 + * (для text Phase 4b-3 и icon Phase 4b-4) + * + * BT.709 limited-range conversion (HDTV). См. также vf_cuda_grid.c rgb_to_yuv709. + * + * Лицензия: LGPL-2.1+ (соответствует FFmpeg) + */ + +#include "cuda/vector_helpers.cuh" + +extern "C" { + +/* Solid colour α-blend на Y plane. + * dst: pointer на Y plane base + * dst_pitch: bytes per row + * rx, ry, rw, rh: region rect (pixels, must be in-bounds) + * fill: 0..255 (Y component to blend in) + * alpha: 0..255 (255 = fully opaque) + */ +__global__ void Alpha_Fill_Y(unsigned char *dst, int dst_pitch, + int rx, int ry, int rw, int rh, + int fill, int alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= rw || y >= rh) return; + unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x); + int cur = *p; + *p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255); +} + +/* Solid colour α-blend на UV plane (NV12 interleaved). + * rx, ry: in chroma plane coords (= full-res Y x/2, y/2) + * rw, rh: also в chroma coords + */ +__global__ void Alpha_Fill_UV(unsigned char *dst, int dst_pitch, + int rx, int ry, int rw, int rh, + int fill_u, int fill_v, int alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= rw || y >= rh) return; + unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2; + int cu = p[0], cv = p[1]; + p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255); + p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255); +} + +/* Blit RGBA atlas → Y plane с α-blending. + * dst, dst_pitch: Y plane + * dx, dy: destination pixel offset (full-res coords) + * atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch в bytes + * w, h: atlas dimensions (in pixels) + * extra_alpha: дополнительный множитель (0..255) — overlay-level opacity + */ +__global__ void Alpha_Blit_RGBA_Y(unsigned char *dst, int dst_pitch, + int dx, int dy, + const unsigned char *atlas, int atlas_pitch, + int w, int h, int extra_alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= w || y >= h) return; + const unsigned char *sp = atlas + y * atlas_pitch + x * 4; + int r = sp[0], g = sp[1], b = sp[2], a = sp[3]; + a = a * extra_alpha / 255; + if (a == 0) return; + int Y = (int)(0.183f * r + 0.614f * g + 0.062f * b) + 16; + Y = Y < 0 ? 0 : (Y > 255 ? 255 : Y); + unsigned char *p = dst + (dy + y) * dst_pitch + (dx + x); + int cur = *p; + *p = (unsigned char)((Y * a + cur * (255 - a)) / 255); +} + +/* Blit RGBA atlas → UV plane с 4:2:0 chroma subsampling. + * dst, dst_pitch: UV plane + * dx, dy: destination pixel offset (full-res coords; обе должны быть кратны 2) + * atlas, atlas_pitch: RGBA source + * w, h: atlas dimensions (full-res; UV operates на w/2 × h/2) + * extra_alpha: 0..255 + */ +__global__ void Alpha_Blit_RGBA_UV(unsigned char *dst, int dst_pitch, + int dx, int dy, + const unsigned char *atlas, int atlas_pitch, + int w, int h, int extra_alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int hw = w / 2, hh = h / 2; + if (x >= hw || y >= hh) return; + + int sx = x * 2, sy = y * 2; + const unsigned char *row0 = atlas + sy * atlas_pitch; + const unsigned char *row1 = atlas + (sy + 1) * atlas_pitch; + + int r = (row0[sx*4+0] + row0[(sx+1)*4+0] + row1[sx*4+0] + row1[(sx+1)*4+0]) >> 2; + int g = (row0[sx*4+1] + row0[(sx+1)*4+1] + row1[sx*4+1] + row1[(sx+1)*4+1]) >> 2; + int b = (row0[sx*4+2] + row0[(sx+1)*4+2] + row1[sx*4+2] + row1[(sx+1)*4+2]) >> 2; + int a = (row0[sx*4+3] + row0[(sx+1)*4+3] + row1[sx*4+3] + row1[(sx+1)*4+3]) >> 2; + a = a * extra_alpha / 255; + if (a == 0) return; + + int U = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128; + int V = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128; + U = U < 0 ? 0 : (U > 255 ? 255 : U); + V = V < 0 ? 0 : (V > 255 ? 255 : V); + + int du = dx / 2 + x; + int dv = dy / 2 + y; + unsigned char *p = dst + dv * dst_pitch + du * 2; + int cu = p[0], cv = p[1]; + p[0] = (unsigned char)((U * a + cu * (255 - a)) / 255); + p[1] = (unsigned char)((V * a + cv * (255 - a)) / 255); +} + +} /* extern "C" */