@@ -0,0 +1,359 @@
/*
* cuda_grid — GPU-native video grid composer для FFmpeg 7.x.
*
* Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
* в layout. End-to-end CUDA (без CPU round-trip).
*
* Phase 1 (MVP): fixed quad layout 2×2, 4 NV12-inputs одинакового размера,
* output size = 2W × 2H, без scaling. Композиция через cuMemcpy2DAsync per
* Y/UV plane: каждый input копируется в соответствующий quadrant output.
*
* Future phases (см. gx/vf-cuda-grid#1):
* - Phase 2: dynamic layouts + per-cell scaling
* - Phase 3: runtime layout switching через process_command (ZMQ)
* - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
*
* Лицензия: LGPL-2.1+ (соответствует FFmpeg)
*/
# include "config_components.h"
# include "libavutil/common.h"
# include "libavutil/cuda_check.h"
# include "libavutil/hwcontext.h"
# include "libavutil/hwcontext_cuda_internal.h"
# include "libavutil/log.h"
# include "libavutil/mathematics.h"
# include "libavutil/mem.h"
# include "libavutil/opt.h"
# include "libavutil/pixdesc.h"
# include "avfilter.h"
# include "filters.h"
# include "formats.h"
# include "framesync.h"
# include "video.h"
/* Wraps a CUDA driver call in FF_CUDA_CHECK_DL. The expansion refers to the
 * identifiers `ctx` and `s` by name, so both must be in scope at every use
 * site (`s` must expose hwctx->internal->cuda_dl). */
# define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
/* Number of inputs composed into the grid. Sizes the cells[] array in the
 * filter context and bounds every per-input loop in this file. */
# define CUDA_GRID_INPUTS 4 /* Phase 1: fixed quad */
/**
 * Private per-instance state of the cuda_grid filter.
 */
typedef struct CudaGridContext {
    const AVClass *class;

    /* CUDA device handles. hwctx, cu_ctx and cu_stream are filled in by
     * cuda_grid_config_output() from the device of input 0. hw_device_ctx is
     * only released in cuda_grid_uninit() in the code visible here. */
    AVBufferRef         *hw_device_ctx;
    AVCUDADeviceContext *hwctx;
    CUcontext            cu_ctx;
    CUstream             cu_stream;

    /* Frame synchronizer driving all CUDA_GRID_INPUTS inputs. */
    FFFrameSync fs;

    /* Output dimensions (computed in config_output). */
    int out_width;
    int out_height;

    /* Target rectangle of each input inside the output frame, in luma
     * pixels. Phase 1 hardcodes four cells in a 2x2 arrangement:
     * top-left, top-right, bottom-left, bottom-right. */
    struct {
        int x, y, w, h;
    } cells[CUDA_GRID_INPUTS];
} CudaGridContext;
/* ─── Composition: copy одного input plane в target region output ──────── */
/**
 * Queue an asynchronous device-to-device 2D copy of one plane of an input
 * into a region of the matching output plane.
 *
 * The copy is issued on s->cu_stream via cuMemcpy2DAsync; this function does
 * not wait for it to finish. The source is always read from its origin
 * (source X/Y offsets are left at zero).
 *
 * @param ctx             filter context (also used by CHECK_CU for logging)
 * @param src_data        device pointer to the source plane
 * @param src_pitch       source row pitch in bytes
 * @param src_w           number of samples per row to copy
 * @param src_h           number of rows to copy
 * @param dst_data        device pointer to the destination plane
 * @param dst_pitch       destination row pitch in bytes
 * @param dst_x           destination column offset, in samples
 * @param dst_y           destination row offset, in rows
 * @param bytes_per_pixel size of one sample in bytes; scales src_w and dst_x
 * @return the value produced by CHECK_CU for the copy call
 */
static int copy_input_plane(AVFilterContext *ctx,
                            CUdeviceptr src_data,
                            int src_pitch,
                            int src_w,
                            int src_h,
                            CUdeviceptr dst_data,
                            int dst_pitch,
                            int dst_x,
                            int dst_y,
                            int bytes_per_pixel)
{
    CudaGridContext *s = ctx->priv;
    CUDA_MEMCPY2D copy_desc = { 0 };

    /* Source: whole plane starting at its origin. */
    copy_desc.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    copy_desc.srcDevice     = src_data;
    copy_desc.srcPitch      = src_pitch;

    /* Destination: offset into the output plane. */
    copy_desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copy_desc.dstDevice     = dst_data;
    copy_desc.dstPitch      = dst_pitch;
    copy_desc.dstXInBytes   = (size_t)dst_x * bytes_per_pixel;
    copy_desc.dstY          = dst_y;

    /* Extent of the copied rectangle. */
    copy_desc.WidthInBytes  = (size_t)src_w * bytes_per_pixel;
    copy_desc.Height        = src_h;

    return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&copy_desc, s->cu_stream));
}
/* ─── Framesync callback — N frames are ready, compose ─────────────────── */
/**
 * Framesync event callback: compose the current frame of every input into
 * one output frame and send it downstream.
 *
 * Each input's Y plane and interleaved UV plane are copied into the cell
 * assigned to that input (s->cells[i]). Copies are queued asynchronously on
 * s->cu_stream.
 *
 * The output timestamp is taken from the framesync event (fs->pts), not from
 * input 0's frame: framesync may present the same input-0 frame for several
 * consecutive events, and inheriting its pts would emit duplicate timestamps.
 *
 * @param fs framesync context; fs->parent is the filter context
 * @return 0 if an input has no frame yet (nothing is output), a negative
 *         AVERROR on failure, otherwise the result of ff_filter_frame()
 */
static int cuda_grid_compose(FFFrameSync *fs)
{
    AVFilterContext *ctx     = fs->parent;
    AVFilterLink    *outlink = ctx->outputs[0];
    CudaGridContext *s       = ctx->priv;
    AVFrame *out = NULL;
    AVFrame *in[CUDA_GRID_INPUTS] = { 0 };
    CUcontext dummy;
    int ret;

    /* Collect the current frame of every input; framesync keeps ownership. */
    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        ret = ff_framesync_get_frame(fs, i, &in[i], 0);
        if (ret < 0)
            return ret;
        if (!in[i]) {
            av_log(ctx, AV_LOG_WARNING, " input %d not ready, skipping frame \n ", i);
            return 0;
        }
    }

    /* Validate all inputs up front so that a mismatch cannot leave a
     * partially composed frame or a pushed CUDA context behind. */
    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        int cw = s->cells[i].w;
        int ch = s->cells[i].h;

        if (in[i]->width != cw || in[i]->height != ch) {
            av_log(ctx, AV_LOG_ERROR,
                   " input %d size %dx%d != expected cell size %dx%d "
                   " (Phase 1: no scaling, all inputs must match cell size) \n ",
                   i, in[i]->width, in[i]->height, cw, ch);
            return AVERROR(EINVAL);
        }
    }

    /* Output frame from the output link's hardware frame pool. */
    out = ff_get_video_buffer(outlink, s->out_width, s->out_height);
    if (!out)
        return AVERROR(ENOMEM);

    /* Metadata (color properties etc.) comes from the first input. */
    ret = av_frame_copy_props(out, in[0]);
    if (ret < 0)
        goto fail;
    out->width  = s->out_width;
    out->height = s->out_height;
    /* Timestamp comes from the sync event, rescaled to the output time base. */
    out->pts    = av_rescale_q(fs->pts, fs->time_base, outlink->time_base);

    /* Make the device context current for all copies issued below. */
    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
    if (ret < 0)
        goto fail;

    /* NV12: data[0] = Y, data[1] = interleaved UV; linesize[] = pitch. */
    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        AVFrame *src = in[i];
        int cx = s->cells[i].x;
        int cy = s->cells[i].y;

        /* Y plane: full resolution, 1 byte per sample. */
        ret = copy_input_plane(ctx,
                               (CUdeviceptr)src->data[0], src->linesize[0],
                               src->width, src->height,
                               (CUdeviceptr)out->data[0], out->linesize[0],
                               cx, cy, 1);
        if (ret < 0)
            break;

        /* UV plane: half resolution in both directions, 2 bytes per
         * chroma sample pair. */
        ret = copy_input_plane(ctx,
                               (CUdeviceptr)src->data[1], src->linesize[1],
                               src->width / 2, src->height / 2,
                               (CUdeviceptr)out->data[1], out->linesize[1],
                               cx / 2, cy / 2, 2);
        if (ret < 0)
            break;
    }

    /* Single pop for both the success and the failure path. */
    CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
    if (ret < 0)
        goto fail;

    return ff_filter_frame(outlink, out);

fail:
    av_frame_free(&out);
    return ret;
}
/* ─── Lifecycle: init / uninit / query_formats / config_input / config_output ─ */
/**
 * Filter init callback. Phase 1 has nothing to set up here: the four input
 * pads are declared statically in the filter definition.
 *
 * @return always 0
 */
static av_cold int cuda_grid_init(AVFilterContext *ctx)
{
    /* Private context is intentionally untouched at this stage. */
    CudaGridContext *priv = ctx->priv;

    (void)priv;
    return 0;
}
/**
 * Filter uninit callback: tear down the frame synchronizer and drop the
 * reference held in hw_device_ctx.
 */
static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
{
    CudaGridContext *priv = ctx->priv;

    ff_framesync_uninit(&priv->fs);
    av_buffer_unref(&priv->hw_device_ctx);
}
/**
 * Input pad config callback: reject inputs that do not carry a hardware
 * frames context.
 *
 * @param inlink input link being configured; this filter is its destination
 * @return 0 on success, AVERROR(EINVAL) if the link has no hw_frames_ctx
 */
static int cuda_grid_config_input(AVFilterLink *inlink)
{
    /* For an input link this filter is the destination; inlink->src is the
     * upstream filter and must not be used as the logging context. */
    AVFilterContext *ctx = inlink->dst;
    FilterLink      *inl = ff_filter_link(inlink);

    if (!inl->hw_frames_ctx || !inl->hw_frames_ctx->data) {
        av_log(ctx, AV_LOG_ERROR, " input %d: software pixel format не поддерживается \n ",
               FF_INLINK_IDX(inlink));
        return AVERROR(EINVAL);
    }
    return 0;
}
/**
 * Output pad config callback.
 *
 * Validates that all inputs are NV12 hardware frames on the same device and
 * of identical, even dimensions; sets the output size to 2W x 2H; fills the
 * four cell rectangles; caches the CUDA context/stream of input 0; creates
 * the output hardware frames context; and configures the frame synchronizer.
 *
 * Even dimensions are required because the compose step addresses the NV12
 * chroma plane with w/2, h/2, x/2 and y/2: with an odd W or H the last chroma
 * column/row of every cell would never be written and neighbouring cells
 * would share a chroma sample.
 *
 * @param outlink output link being configured
 * @return 0 on success, a negative AVERROR code on failure
 */
static int cuda_grid_config_output(AVFilterLink *outlink)
{
    AVFilterContext   *ctx  = outlink->src;
    CudaGridContext   *s    = ctx->priv;
    AVFilterLink      *in0  = ctx->inputs[0];
    FilterLink        *inl0 = ff_filter_link(in0);
    FilterLink        *outl = ff_filter_link(outlink);
    AVHWFramesContext *hwfc0;
    AVHWFramesContext *out_hwfc;
    AVHWDeviceContext *hwdev;
    AVBufferRef       *out_ref;
    int W, H, ret;

    if (!inl0->hw_frames_ctx) {
        av_log(ctx, AV_LOG_ERROR, "input 0 has no hardware frames context\n");
        return AVERROR(EINVAL);
    }
    hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data;

    if (hwfc0->sw_format != AV_PIX_FMT_NV12) {
        av_log(ctx, AV_LOG_ERROR,
               " Phase 1 supports only NV12, got %s \n ",
               av_get_pix_fmt_name(hwfc0->sw_format));
        return AVERROR(EINVAL);
    }

    W = in0->w;
    H = in0->h;

    /* NV12 chroma is subsampled 2x2; cell origins and sizes must be even. */
    if ((W & 1) || (H & 1)) {
        av_log(ctx, AV_LOG_ERROR,
               "Phase 1: NV12 quad layout requires even input dimensions, got %dx%d\n",
               W, H);
        return AVERROR(EINVAL);
    }

    /* Every other input must share device, sw_format and size with input 0. */
    for (int i = 1; i < CUDA_GRID_INPUTS; i++) {
        AVFilterLink      *inN = ctx->inputs[i];
        FilterLink        *ilN = ff_filter_link(inN);
        AVHWFramesContext *hN;

        if (!ilN->hw_frames_ctx) {
            av_log(ctx, AV_LOG_ERROR, "input %d has no hardware frames context\n", i);
            return AVERROR(EINVAL);
        }
        hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data;

        if (hN->device_ctx != hwfc0->device_ctx) {
            av_log(ctx, AV_LOG_ERROR, " input %d device mismatch \n ", i);
            return AVERROR(EINVAL);
        }
        if (hN->sw_format != hwfc0->sw_format) {
            av_log(ctx, AV_LOG_ERROR, " input %d sw_format mismatch \n ", i);
            return AVERROR(EINVAL);
        }
        if (inN->w != W || inN->h != H) {
            av_log(ctx, AV_LOG_ERROR,
                   " Phase 1: input %d size %dx%d != input 0 size %dx%d. "
                   " В этой фазе scaling не поддерживается, все inputs должны быть одного размера. \n ",
                   i, inN->w, inN->h, W, H);
            return AVERROR(EINVAL);
        }
    }

    /* Output = 2W x 2H for the quad layout. */
    s->out_width  = 2 * W;
    s->out_height = 2 * H;
    outlink->w    = s->out_width;
    outlink->h    = s->out_height;

    /* Hardcoded quad cell positions: TL, TR, BL, BR. */
    s->cells[0].x = 0; s->cells[0].y = 0; s->cells[0].w = W; s->cells[0].h = H;
    s->cells[1].x = W; s->cells[1].y = 0; s->cells[1].w = W; s->cells[1].h = H;
    s->cells[2].x = 0; s->cells[2].y = H; s->cells[2].w = W; s->cells[2].h = H;
    s->cells[3].x = W; s->cells[3].y = H; s->cells[3].w = W; s->cells[3].h = H;

    /* CUDA device context and stream are taken from input 0. */
    hwdev        = hwfc0->device_ctx;
    s->hwctx     = (AVCUDADeviceContext *)hwdev->hwctx;
    s->cu_ctx    = s->hwctx->cuda_ctx;
    s->cu_stream = s->hwctx->stream;

    /* Output hw_frames_ctx: same device as input 0, output dimensions. */
    out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref);
    if (!out_ref)
        return AVERROR(ENOMEM);
    out_hwfc = (AVHWFramesContext *)out_ref->data;
    out_hwfc->format            = AV_PIX_FMT_CUDA;
    out_hwfc->sw_format         = AV_PIX_FMT_NV12;
    out_hwfc->width             = s->out_width;
    out_hwfc->height            = s->out_height;
    out_hwfc->initial_pool_size = 4;

    ret = av_hwframe_ctx_init(out_ref);
    if (ret < 0) {
        av_buffer_unref(&out_ref);
        return ret;
    }
    /* The link takes ownership of the reference. */
    outl->hw_frames_ctx = out_ref;

    /* Frame synchronizer across all inputs. */
    ret = ff_framesync_init(&s->fs, ctx, CUDA_GRID_INPUTS);
    if (ret < 0)
        return ret;

    for (int i = 0; i < CUDA_GRID_INPUTS; i++) {
        FFFrameSyncIn *fs_in = &s->fs.in[i];

        fs_in->time_base = ctx->inputs[i]->time_base;
        fs_in->sync      = 1;
        fs_in->before    = EXT_STOP;
        fs_in->after     = EXT_INFINITY;
    }
    s->fs.opaque   = s;
    s->fs.on_event = cuda_grid_compose;

    outlink->time_base = ctx->inputs[0]->time_base;

    return ff_framesync_configure(&s->fs);
}
/**
 * Filter activate callback: delegate all scheduling to the frame
 * synchronizer stored in the private context.
 *
 * @return the result of ff_framesync_activate()
 */
static int cuda_grid_activate(AVFilterContext *ctx)
{
    CudaGridContext *priv = ctx->priv;
    FFFrameSync     *sync = &priv->fs;

    return ff_framesync_activate(sync);
}
/* ─── Filter registration ──────────────────────────────────────────────── */
/* Option table of the filter. Phase 1 exposes no options, so the table holds
 * only the terminating entry; Phase 2 is planned to add `layout=`. */
static const AVOption cuda_grid_options[] = {
    { NULL }
};
/* AVClass describing the filter's private context; referenced from the
 * filter definition through .priv_class. */
static const AVClass cuda_grid_class = {
    .class_name = " cuda_grid ",
    .item_name  = av_default_item_name,
    .option     = cuda_grid_options,
    .version    = LIBAVUTIL_VERSION_INT,
    .category   = AV_CLASS_CATEGORY_FILTER,
};
/* The four statically declared video input pads. All of them share the same
 * config callback, cuda_grid_config_input(). */
static const AVFilterPad cuda_grid_inputs[] = {
    {
        .name         = " input0 ",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = cuda_grid_config_input,
    },
    {
        .name         = " input1 ",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = cuda_grid_config_input,
    },
    {
        .name         = " input2 ",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = cuda_grid_config_input,
    },
    {
        .name         = " input3 ",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = cuda_grid_config_input,
    },
};
/* The single video output pad, configured by cuda_grid_config_output(). */
static const AVFilterPad cuda_grid_outputs[] = {
    {
        .name         = " default ",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = cuda_grid_config_output,
    },
};
/* Filter definition: wires the callbacks, pads and private class declared in
 * this file together. The only pixel format declared is AV_PIX_FMT_CUDA. */
const AVFilter ff_vf_cuda_grid = {
    .name           = " cuda_grid ",
    .description    = NULL_IF_CONFIG_SMALL(" GPU-native video grid composer (CUDA). "),
    .priv_class     = &cuda_grid_class,
    .priv_size      = sizeof(CudaGridContext),
    .init           = cuda_grid_init,
    .uninit         = cuda_grid_uninit,
    .activate       = cuda_grid_activate,
    FILTER_INPUTS(cuda_grid_inputs),
    FILTER_OUTPUTS(cuda_grid_outputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
    .flags          = AVFILTER_FLAG_HWDEVICE,
    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};