WIP: libavfilter: vf_cuda_grid — Phase 1 MVP (fixed quad layout) #2

Draft
gx wants to merge 14 commits from n7.1-vf-cuda-grid into n7.1-cuframes
5 changed files with 1805 additions and 0 deletions
Vendored
+2
View File
@@ -3317,6 +3317,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
transpose_npp_filter_deps="ffnvcodec libnpp"
overlay_cuda_filter_deps="ffnvcodec"
overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
cuda_grid_filter_deps="ffnvcodec"
cuda_grid_filter_deps_any="cuda_nvcc cuda_llvm"
sharpen_npp_filter_deps="ffnvcodec libnpp"
ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
+2
View File
@@ -410,6 +410,8 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o
OBJS-$(CONFIG_OVERLAY_FILTER) += vf_overlay.o framesync.o
OBJS-$(CONFIG_OVERLAY_CUDA_FILTER) += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
cuda/load_helper.o
OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o \
vf_cuda_grid.ptx.o cuda/load_helper.o
OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \
opencl/overlay.o framesync.o
OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o
+1
View File
@@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv;
extern const AVFilter ff_vf_overlay_vaapi;
extern const AVFilter ff_vf_overlay_vulkan;
extern const AVFilter ff_vf_overlay_cuda;
extern const AVFilter ff_vf_cuda_grid;
extern const AVFilter ff_vf_owdenoise;
extern const AVFilter ff_vf_pad;
extern const AVFilter ff_vf_pad_opencl;
File diff suppressed because it is too large Load Diff
+121
View File
@@ -0,0 +1,121 @@
/*
 * cuda_grid overlay CUDA kernels.
 *
 * Alpha blending on top of an NV12 frame:
 * - Alpha_Fill_Y / Alpha_Fill_UV: solid colour fill of a region (rect/dim)
 * - Alpha_Blit_RGBA_Y / Alpha_Blit_RGBA_UV: blit an RGBA atlas → NV12
 *   (for text in Phase 4b-3 and icons in Phase 4b-4)
 *
 * BT.709 limited-range conversion (HDTV). See also rgb_to_yuv709 in vf_cuda_grid.c.
 *
 * License: LGPL-2.1+ (matches FFmpeg)
 */
#include "cuda/vector_helpers.cuh"
extern "C" {
/* Alpha-blend a constant luma value over a rectangle of the Y plane.
 *
 * Launch layout: 2D grid, one thread per pixel of the rectangle; threads
 * that fall outside rw x rh exit without touching memory.
 *
 * dst:            pointer to the base of the Y plane
 * dst_pitch:      bytes per row of the Y plane
 * rx, ry, rw, rh: rectangle in pixels; it must lie inside the plane, the
 *                 kernel does no clipping against the frame itself
 * fill:           0..255, the Y value to blend in
 * alpha:          0..255 (255 = fully opaque)
 */
__global__ void Alpha_Fill_Y(unsigned char *dst, int dst_pitch,
                             int rx, int ry, int rw, int rh,
                             int fill, int alpha)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < rw && row < rh) {
        unsigned char *px = dst + (ry + row) * dst_pitch + (rx + col);
        const int under     = *px;
        const int inv_alpha = 255 - alpha;

        /* Integer blend; the division truncates. */
        const int blended = (fill * alpha + under * inv_alpha) / 255;
        *px = (unsigned char)blended;
    }
}
/* Alpha-blend a constant chroma pair over a rectangle of the interleaved
 * UV plane (NV12: U and V bytes alternate, two bytes per chroma sample).
 *
 * Launch layout: 2D grid, one thread per chroma sample of the rectangle;
 * threads outside rw x rh exit without touching memory.
 *
 * dst, dst_pitch: UV plane base and bytes per row
 * rx, ry:         rectangle origin in chroma-plane coordinates
 *                 (= full-res Y x/2, y/2)
 * rw, rh:         rectangle size, also in chroma coordinates
 * fill_u, fill_v: chroma values to blend in
 * alpha:          0..255 (255 = fully opaque)
 */
__global__ void Alpha_Fill_UV(unsigned char *dst, int dst_pitch,
                              int rx, int ry, int rw, int rh,
                              int fill_u, int fill_v, int alpha)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < rw && row < rh) {
        /* Each chroma sample occupies two bytes, hence the "* 2". */
        unsigned char *uv = dst + (ry + row) * dst_pitch + (rx + col) * 2;
        const int keep  = 255 - alpha;
        const int old_u = uv[0];
        const int old_v = uv[1];

        uv[0] = (unsigned char)((fill_u * alpha + old_u * keep) / 255);
        uv[1] = (unsigned char)((fill_v * alpha + old_v * keep) / 255);
    }
}
/* Blit an RGBA atlas onto the Y plane with alpha blending.
 *
 * Launch layout: 2D grid, one thread per atlas pixel; threads outside
 * w x h exit without touching memory. The destination is not clipped.
 *
 * dst, dst_pitch:     Y plane base and bytes per row
 * dx, dy:             destination pixel offset (full-res coordinates)
 * atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch in bytes
 * w, h:               atlas dimensions in pixels
 * extra_alpha:        additional multiplier (0..255), overlay-level opacity
 */
__global__ void Alpha_Blit_RGBA_Y(unsigned char *dst, int dst_pitch,
                                  int dx, int dy,
                                  const unsigned char *atlas, int atlas_pitch,
                                  int w, int h, int extra_alpha)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= w || row >= h)
        return;

    const unsigned char *texel = atlas + row * atlas_pitch + col * 4;
    const int red   = texel[0];
    const int green = texel[1];
    const int blue  = texel[2];

    /* Combine per-texel alpha with the overlay-level opacity. */
    const int opacity = texel[3] * extra_alpha / 255;
    if (opacity == 0)
        return; /* fully transparent: leave the destination untouched */

    /* RGB -> Y using the file's BT.709 limited-range weights (offset 16). */
    int luma = (int)(0.183f * red + 0.614f * green + 0.062f * blue) + 16;
    if (luma < 0)
        luma = 0;
    else if (luma > 255)
        luma = 255;

    unsigned char *out = dst + (dy + row) * dst_pitch + (dx + col);
    const int under = *out;
    *out = (unsigned char)((luma * opacity + under * (255 - opacity)) / 255);
}
/* Blit an RGBA atlas onto the interleaved UV plane with 4:2:0 chroma
 * subsampling.
 *
 * Each thread averages one 2x2 block of atlas texels (R, G, B and A
 * separately), converts the result to U/V with the file's BT.709
 * limited-range weights and alpha-blends it into one chroma sample.
 *
 * Launch layout: 2D grid, one thread per chroma sample. To cover atlases
 * with an odd width or height the grid must span ceil(w/2) x ceil(h/2)
 * threads; for the trailing odd column/row the missing neighbour texel is
 * replaced by the edge texel, so the chroma under the last luma column/row
 * is blended too instead of keeping the old colour. Even sizes behave
 * exactly as before.
 *
 * dst, dst_pitch:     UV plane base and bytes per row
 * dx, dy:             destination pixel offset (full-res coordinates; both
 *                     must be multiples of 2)
 * atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch in bytes
 * w, h:               atlas dimensions in full-res pixels
 * extra_alpha:        0..255, overlay-level opacity
 *
 * NOTE(review): RGB is averaged without alpha weighting, so fully
 * transparent texels still contribute their colour to the block average.
 */
__global__ void Alpha_Blit_RGBA_UV(unsigned char *dst, int dst_pitch,
                                   int dx, int dy,
                                   const unsigned char *atlas, int atlas_pitch,
                                   int w, int h, int extra_alpha)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    /* Round up so a trailing odd column/row still gets a chroma sample. */
    int cw = (w + 1) / 2, ch = (h + 1) / 2;
    if (x >= cw || y >= ch) return;

    /* Source 2x2 block; clamp the second column/row to the atlas edge so
     * odd sizes never read past the last texel. */
    int sx0 = x * 2, sy0 = y * 2;
    int sx1 = (sx0 + 1 < w) ? sx0 + 1 : sx0;
    int sy1 = (sy0 + 1 < h) ? sy0 + 1 : sy0;

    const unsigned char *row0 = atlas + sy0 * atlas_pitch;
    const unsigned char *row1 = atlas + sy1 * atlas_pitch;
    const unsigned char *t00 = row0 + sx0 * 4;
    const unsigned char *t01 = row0 + sx1 * 4;
    const unsigned char *t10 = row1 + sx0 * 4;
    const unsigned char *t11 = row1 + sx1 * 4;

    int r = (t00[0] + t01[0] + t10[0] + t11[0]) >> 2;
    int g = (t00[1] + t01[1] + t10[1] + t11[1]) >> 2;
    int b = (t00[2] + t01[2] + t10[2] + t11[2]) >> 2;
    int a = (t00[3] + t01[3] + t10[3] + t11[3]) >> 2;

    a = a * extra_alpha / 255;
    if (a == 0) return; /* fully transparent: leave the destination untouched */

    int U = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128;
    int V = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128;
    U = U < 0 ? 0 : (U > 255 ? 255 : U);
    V = V < 0 ? 0 : (V > 255 ? 255 : V);

    /* Destination chroma sample; two bytes (U, V) per sample. */
    int du = dx / 2 + x;
    int dv = dy / 2 + y;
    unsigned char *p = dst + dv * dst_pitch + du * 2;
    int cu = p[0], cv = p[1];
    p[0] = (unsigned char)((U * a + cu * (255 - a)) / 255);
    p[1] = (unsigned char)((V * a + cv * (255 - a)) / 255);
}
} /* extern "C" */