vf_cuda_grid: Phase 2b — NPP scaling для mixed-size inputs
- Add libnpp dependency в configure (cuda_grid_filter_deps)
- #include <nppi.h>, nppSetStream(s->cu_stream) перед resize batch
- Smart copy-or-scale в compose path:
  - src.size == cell.size → cuMemcpy2DAsync (fast path, zero overhead)
  - else → nppiResizeSqrPixel_8u_C1R для Y + _C2R для NV12 UV interleaved
- NPPI_INTER_LINEAR interpolation (bilinear — стандартный для video)
- Destination pointer offset через explicit pointer arithmetic (NPP не имеет dst_offset параметра, нужно сместить pDst указатель)

Это unblock'ает mixed-size cameras (parking 1920x1080 + gate_lpr 2688x1520 в одном grid с main_plus_preview layout — big cell scaled до 1280x1080, small cells scaled до 640x360).

Phase 2 complete (2a + 2b). Phase 3 будет controller sidecar.
This commit is contained in:
@@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
|
||||
transpose_npp_filter_deps="ffnvcodec libnpp"
|
||||
overlay_cuda_filter_deps="ffnvcodec"
|
||||
overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
|
||||
cuda_grid_filter_deps="ffnvcodec"
|
||||
cuda_grid_filter_deps="ffnvcodec libnpp"
|
||||
sharpen_npp_filter_deps="ffnvcodec libnpp"
|
||||
|
||||
ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
|
||||
|
||||
+68
-28
@@ -19,6 +19,8 @@
|
||||
|
||||
#include "config_components.h"
|
||||
|
||||
#include <nppi.h>
|
||||
|
||||
#include "libavutil/avstring.h"
|
||||
#include "libavutil/common.h"
|
||||
#include "libavutil/cuda_check.h"
|
||||
@@ -213,6 +215,9 @@ static int cuda_grid_compose(FFFrameSync *fs)
|
||||
if (ret < 0)
|
||||
goto fail;
|
||||
|
||||
/* NPP в этом thread'е работает в нашем CUDA stream */
|
||||
nppSetStream(s->cu_stream);
|
||||
|
||||
for (i = 0; i < nb; i++) {
|
||||
AVFrame *src = in[i];
|
||||
int cx = s->cell_px[i].x;
|
||||
@@ -220,36 +225,71 @@ static int cuda_grid_compose(FFFrameSync *fs)
|
||||
int cw = s->cell_px[i].w;
|
||||
int ch = s->cell_px[i].h;
|
||||
|
||||
if (src->width != cw || src->height != ch) {
|
||||
av_log(ctx, AV_LOG_ERROR,
|
||||
"input %d size %dx%d != cell size %dx%d "
|
||||
"(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n",
|
||||
i, src->width, src->height, cw, ch);
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
ret = AVERROR(EINVAL);
|
||||
goto fail;
|
||||
}
|
||||
if (src->width == cw && src->height == ch) {
|
||||
/* Fast path: same size — memcpy без NPP overhead */
|
||||
ret = copy_input_plane(ctx,
|
||||
(CUdeviceptr)src->data[0], src->linesize[0],
|
||||
src->width, src->height,
|
||||
(CUdeviceptr)out->data[0], out->linesize[0],
|
||||
cx, cy, 1);
|
||||
if (ret < 0) {
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
goto fail;
|
||||
}
|
||||
ret = copy_input_plane(ctx,
|
||||
(CUdeviceptr)src->data[1], src->linesize[1],
|
||||
src->width / 2, src->height / 2,
|
||||
(CUdeviceptr)out->data[1], out->linesize[1],
|
||||
cx / 2, cy / 2, 2);
|
||||
if (ret < 0) {
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
goto fail;
|
||||
}
|
||||
} else {
|
||||
/* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords
|
||||
* выровнены до 2 в config_output). */
|
||||
NppStatus npp_err;
|
||||
double xfactor_y = (double)cw / src->width;
|
||||
double yfactor_y = (double)ch / src->height;
|
||||
double xfactor_uv = (double)(cw / 2) / (src->width / 2);
|
||||
double yfactor_uv = (double)(ch / 2) / (src->height / 2);
|
||||
|
||||
/* Y plane */
|
||||
ret = copy_input_plane(ctx,
|
||||
(CUdeviceptr)src->data[0], src->linesize[0],
|
||||
src->width, src->height,
|
||||
(CUdeviceptr)out->data[0], out->linesize[0],
|
||||
cx, cy, 1);
|
||||
if (ret < 0) {
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
goto fail;
|
||||
}
|
||||
uint8_t *dst_y_ptr = out->data[0] + (size_t)cy * out->linesize[0] + cx;
|
||||
uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx;
|
||||
/* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */
|
||||
|
||||
/* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */
|
||||
ret = copy_input_plane(ctx,
|
||||
(CUdeviceptr)src->data[1], src->linesize[1],
|
||||
src->width / 2, src->height / 2,
|
||||
(CUdeviceptr)out->data[1], out->linesize[1],
|
||||
cx / 2, cy / 2, 2);
|
||||
if (ret < 0) {
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
goto fail;
|
||||
/* Y plane — 1 channel (luma) */
|
||||
npp_err = nppiResizeSqrPixel_8u_C1R(
|
||||
src->data[0], (NppiSize){src->width, src->height},
|
||||
src->linesize[0], (NppiRect){0, 0, src->width, src->height},
|
||||
dst_y_ptr, out->linesize[0],
|
||||
(NppiRect){0, 0, cw, ch},
|
||||
xfactor_y, yfactor_y, 0.0, 0.0,
|
||||
NPPI_INTER_LINEAR);
|
||||
if (npp_err != NPP_SUCCESS) {
|
||||
av_log(ctx, AV_LOG_ERROR,
|
||||
"input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n",
|
||||
i, src->width, src->height, cw, ch, npp_err);
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
ret = AVERROR_EXTERNAL;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* UV plane — 2 channels interleaved (NV12 chroma) */
|
||||
npp_err = nppiResizeSqrPixel_8u_C2R(
|
||||
src->data[1], (NppiSize){src->width / 2, src->height / 2},
|
||||
src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2},
|
||||
dst_uv_ptr, out->linesize[1],
|
||||
(NppiRect){0, 0, cw / 2, ch / 2},
|
||||
xfactor_uv, yfactor_uv, 0.0, 0.0,
|
||||
NPPI_INTER_LINEAR);
|
||||
if (npp_err != NPP_SUCCESS) {
|
||||
av_log(ctx, AV_LOG_ERROR,
|
||||
"input %d UV plane NPP resize failed: %d\n", i, npp_err);
|
||||
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
|
||||
ret = AVERROR_EXTERNAL;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user