diff --git a/configure b/configure index cc6a635..9c60cb7 100755 --- a/configure +++ b/configure @@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" -cuda_grid_filter_deps="ffnvcodec libnpp" +cuda_grid_filter_deps="ffnvcodec" sharpen_npp_filter_deps="ffnvcodec libnpp" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index cfad54b..eca57f0 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -4,13 +4,25 @@ * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells * в layout. End-to-end CUDA (без CPU round-trip). * - * Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/ + * Phase 2: layout templates (single/dual_h/dual_v/quad/main_plus_preview/ * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size * через option (default 1920×1080). Cell rects = normalized × output size. - * **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP). + * + * **Scaling delegated to upstream `scale_npp`** filter (Unix philosophy + + * production-tested NPP code). NPP не имеет nppiResize_8u_C2R для NV12 UV + * interleaved, поэтому in-filter scaling = either two intermediate plane + * buffers либо custom CUDA kernel — оба больше work чем filter chain'ить: + * + * ffmpeg ... -filter_complex \ + * "[0]scale_npp=1280:1080[s0]; \ + * [1]scale_npp=640:360[s1]; \ + * [2]scale_npp=640:360[s2]; \ + * [3]scale_npp=640:360[s3]; \ + * [s0][s1][s2][s3]cuda_grid=layout=main_plus_preview[out]" + * + * Controller (Phase 3) auto-generates filter graph с scale_npp per input. * * Future phases (см. gx/vf-cuda-grid#1): - * - Phase 2b: per-cell scaling через libnpp (mixed-size inputs) * - Phase 3: runtime layout switching через process_command (ZMQ) * - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat) * @@ -19,8 +31,6 @@ #include "config_components.h" -#include - #include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/cuda_check.h" @@ -215,9 +225,6 @@ static int cuda_grid_compose(FFFrameSync *fs) if (ret < 0) goto fail; - /* NPP в этом thread'е работает в нашем CUDA stream */ - nppSetStream(s->cu_stream); - for (i = 0; i < nb; i++) { AVFrame *src = in[i]; int cx = s->cell_px[i].x; @@ -225,71 +232,38 @@ static int cuda_grid_compose(FFFrameSync *fs) int cw = s->cell_px[i].w; int ch = s->cell_px[i].h; - if (src->width == cw && src->height == ch) { - /* Fast path: same size — memcpy без NPP overhead */ - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[0], src->linesize[0], - src->width, src->height, - (CUdeviceptr)out->data[0], out->linesize[0], - cx, cy, 1); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; - } - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[1], src->linesize[1], - src->width / 2, src->height / 2, - (CUdeviceptr)out->data[1], out->linesize[1], - cx / 2, cy / 2, 2); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; - } - } else { - /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords - * выровнены до 2 в config_output). */ - NppStatus npp_err; - double xfactor_y = (double)cw / src->width; - double yfactor_y = (double)ch / src->height; - double xfactor_uv = (double)(cw / 2) / (src->width / 2); - double yfactor_uv = (double)(ch / 2) / (src->height / 2); + if (src->width != cw || src->height != ch) { + av_log(ctx, AV_LOG_ERROR, + "input %d size %dx%d != cell size %dx%d. " + "cuda_grid не делает scaling — используй upstream scale_npp:\n" + " [in%d]scale_npp=%d:%d[scaled%d]; [scaled%d]...cuda_grid=layout=%s\n", + i, src->width, src->height, cw, ch, + i, cw, ch, i, i, s->layout->name); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR(EINVAL); + goto fail; + } - uint8_t *dst_y_ptr = out->data[0] + (size_t)cy * out->linesize[0] + cx; - uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx; - /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */ + /* Y plane (1 byte per pixel) */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[0], src->linesize[0], + src->width, src->height, + (CUdeviceptr)out->data[0], out->linesize[0], + cx, cy, 1); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } - /* Y plane — 1 channel (luma) */ - npp_err = nppiResizeSqrPixel_8u_C1R( - src->data[0], (NppiSize){src->width, src->height}, - src->linesize[0], (NppiRect){0, 0, src->width, src->height}, - dst_y_ptr, out->linesize[0], - (NppiRect){0, 0, cw, ch}, - xfactor_y, yfactor_y, 0.0, 0.0, - NPPI_INTER_LINEAR); - if (npp_err != NPP_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, - "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n", - i, src->width, src->height, cw, ch, npp_err); - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - ret = AVERROR_EXTERNAL; - goto fail; - } - - /* UV plane — 2 channels interleaved (NV12 chroma) */ - npp_err = nppiResizeSqrPixel_8u_C2R( - src->data[1], (NppiSize){src->width / 2, src->height / 2}, - src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2}, - dst_uv_ptr, out->linesize[1], - (NppiRect){0, 0, cw / 2, ch / 2}, - xfactor_uv, yfactor_uv, 0.0, 0.0, - NPPI_INTER_LINEAR); - if (npp_err != NPP_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, - "input %d UV plane NPP resize failed: %d\n", i, npp_err); - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - ret = AVERROR_EXTERNAL; - goto fail; - } + /* UV plane (NV12: half resolution, 2 bytes per "pixel") */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[1], src->linesize[1], + src->width / 2, src->height / 2, + (CUdeviceptr)out->data[1], out->linesize[1], + cx / 2, cy / 2, 2); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; } }