From 11f310061a3ac0290b790c565d8d4cb86cabdbf5 Mon Sep 17 00:00:00 2001 From: gx Date: Tue, 19 May 2026 21:20:04 +0100 Subject: [PATCH] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202b=20=E2=80=94=20NP?= =?UTF-8?q?P=20scaling=20=D0=B4=D0=BB=D1=8F=20mixed-size=20inputs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add libnpp dependency в configure (cuda_grid_filter_deps) - #include <npp.h>, nppSetStream(s->cu_stream) перед resize batch - Smart copy-or-scale в compose path: - src.size == cell.size → cuMemcpy2DAsync (fast path, zero overhead) - else → nppiResizeSqrPixel_8u_C1R для Y + _C2R для NV12 UV interleaved - NPPI_INTER_LINEAR interpolation (bilinear — стандартный для video) - Destination pointer offset через explicit pointer arithmetic (NPP не имеет dst_offset параметра, нужно сместить pDst указатель) Это unblock'ает mixed-size cameras (parking 1920x1080 + gate_lpr 2688x1520 в одном grid с main_plus_preview layout — big cell scaled до 1280x1080, small cells scaled до 640x360). Phase 2 complete (2a + 2b). Phase 3 будет controller sidecar. 
--- configure | 2 +- libavfilter/vf_cuda_grid.c | 96 +++++++++++++++++++++++++++----------- 2 files changed, 69 insertions(+), 29 deletions(-) diff --git a/configure b/configure index 9c60cb7..cc6a635 100755 --- a/configure +++ b/configure @@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" -cuda_grid_filter_deps="ffnvcodec" +cuda_grid_filter_deps="ffnvcodec libnpp" sharpen_npp_filter_deps="ffnvcodec libnpp" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index bfc4a53..cfad54b 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -19,6 +19,8 @@ #include "config_components.h" +#include + #include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/cuda_check.h" @@ -213,6 +215,9 @@ static int cuda_grid_compose(FFFrameSync *fs) if (ret < 0) goto fail; + /* NPP в этом thread'е работает в нашем CUDA stream */ + nppSetStream(s->cu_stream); + for (i = 0; i < nb; i++) { AVFrame *src = in[i]; int cx = s->cell_px[i].x; @@ -220,36 +225,71 @@ static int cuda_grid_compose(FFFrameSync *fs) int cw = s->cell_px[i].w; int ch = s->cell_px[i].h; - if (src->width != cw || src->height != ch) { - av_log(ctx, AV_LOG_ERROR, - "input %d size %dx%d != cell size %dx%d " - "(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n", - i, src->width, src->height, cw, ch); - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - ret = AVERROR(EINVAL); - goto fail; - } + if (src->width == cw && src->height == ch) { + /* Fast path: same size — memcpy без NPP overhead */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[0], src->linesize[0], + src->width, src->height, + (CUdeviceptr)out->data[0], out->linesize[0], + cx, cy, 1); + if (ret < 0) { + 
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[1], src->linesize[1], + src->width / 2, src->height / 2, + (CUdeviceptr)out->data[1], out->linesize[1], + cx / 2, cy / 2, 2); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + } else { + /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords + * выровнены до 2 в config_output). */ + NppStatus npp_err; + double xfactor_y = (double)cw / src->width; + double yfactor_y = (double)ch / src->height; + double xfactor_uv = (double)(cw / 2) / (src->width / 2); + double yfactor_uv = (double)(ch / 2) / (src->height / 2); - /* Y plane */ - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[0], src->linesize[0], - src->width, src->height, - (CUdeviceptr)out->data[0], out->linesize[0], - cx, cy, 1); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; - } + uint8_t *dst_y_ptr = out->data[0] + (size_t)cy * out->linesize[0] + cx; + uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx; + /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */ - /* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */ - ret = copy_input_plane(ctx, - (CUdeviceptr)src->data[1], src->linesize[1], - src->width / 2, src->height / 2, - (CUdeviceptr)out->data[1], out->linesize[1], - cx / 2, cy / 2, 2); - if (ret < 0) { - CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - goto fail; + /* Y plane — 1 channel (luma) */ + npp_err = nppiResizeSqrPixel_8u_C1R( + src->data[0], (NppiSize){src->width, src->height}, + src->linesize[0], (NppiRect){0, 0, src->width, src->height}, + dst_y_ptr, out->linesize[0], + (NppiRect){0, 0, cw, ch}, + xfactor_y, yfactor_y, 0.0, 0.0, + NPPI_INTER_LINEAR); + if (npp_err != NPP_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, + "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n", + i, 
src->width, src->height, cw, ch, npp_err); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR_EXTERNAL; + goto fail; + } + + /* UV plane — 2 channels interleaved (NV12 chroma) */ + npp_err = nppiResizeSqrPixel_8u_C2R( + src->data[1], (NppiSize){src->width / 2, src->height / 2}, + src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2}, + dst_uv_ptr, out->linesize[1], + (NppiRect){0, 0, cw / 2, ch / 2}, + xfactor_uv, yfactor_uv, 0.0, 0.0, + NPPI_INTER_LINEAR); + if (npp_err != NPP_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, + "input %d UV plane NPP resize failed: %d\n", i, npp_err); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR_EXTERNAL; + goto fail; + } } }