From 11f310061a3ac0290b790c565d8d4cb86cabdbf5 Mon Sep 17 00:00:00 2001
From: gx <gx@goldix.org>
Date: Tue, 19 May 2026 21:20:04 +0100
Subject: [PATCH] =?UTF-8?q?vf=5Fcuda=5Fgrid:=20Phase=202b=20=E2=80=94=20NP?=
 =?UTF-8?q?P=20scaling=20=D0=B4=D0=BB=D1=8F=20mixed-size=20inputs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add libnpp dependency в configure (cuda_grid_filter_deps)
- #include <nppi.h>, nppSetStream(s->cu_stream) перед resize batch
- Smart copy-or-scale в compose path:
  - src.size == cell.size → cuMemcpy2DAsync (fast path, zero overhead)
  - else → nppiResizeSqrPixel_8u_C1R для Y + _C2R для NV12 UV interleaved
- NPPI_INTER_LINEAR interpolation (bilinear — стандартный для video)
- Destination pointer offset через explicit pointer arithmetic
  (NPP не имеет dst_offset параметра, нужно сместить pDst указатель)

Это unblock'ает mixed-size cameras (parking 1920x1080 + gate_lpr 2688x1520
в одном grid с main_plus_preview layout — big cell scaled до 1280x1080,
small cells scaled до 640x360).

Phase 2 complete (2a + 2b). Phase 3 будет controller sidecar.
---
 configure                  |  2 +-
 libavfilter/vf_cuda_grid.c | 96 +++++++++++++++++++++++++++-----------
 2 files changed, 69 insertions(+), 29 deletions(-)

diff --git a/configure b/configure
index 9c60cb7..cc6a635 100755
--- a/configure
+++ b/configure
@@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
-cuda_grid_filter_deps="ffnvcodec"
+cuda_grid_filter_deps="ffnvcodec libnpp"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c
index bfc4a53..cfad54b 100644
--- a/libavfilter/vf_cuda_grid.c
+++ b/libavfilter/vf_cuda_grid.c
@@ -19,6 +19,8 @@
 
 #include "config_components.h"
 
+#include <nppi.h>
+
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/cuda_check.h"
@@ -213,6 +215,9 @@ static int cuda_grid_compose(FFFrameSync *fs)
     if (ret < 0)
         goto fail;
 
+    /* NPP в этом thread'е работает в нашем CUDA stream */
+    nppSetStream(s->cu_stream);
+
     for (i = 0; i < nb; i++) {
         AVFrame *src = in[i];
         int cx = s->cell_px[i].x;
@@ -220,36 +225,71 @@ static int cuda_grid_compose(FFFrameSync *fs)
         int cw = s->cell_px[i].w;
         int ch = s->cell_px[i].h;
 
-        if (src->width != cw || src->height != ch) {
-            av_log(ctx, AV_LOG_ERROR,
-                   "input %d size %dx%d != cell size %dx%d "
-                   "(Phase 2a: no scaling — Phase 2b добавит NPP resize)\n",
-                   i, src->width, src->height, cw, ch);
-            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-            ret = AVERROR(EINVAL);
-            goto fail;
-        }
+        if (src->width == cw && src->height == ch) {
+            /* Fast path: same size — memcpy без NPP overhead */
+            ret = copy_input_plane(ctx,
+                                   (CUdeviceptr)src->data[0], src->linesize[0],
+                                   src->width, src->height,
+                                   (CUdeviceptr)out->data[0], out->linesize[0],
+                                   cx, cy, 1);
+            if (ret < 0) {
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                goto fail;
+            }
+            ret = copy_input_plane(ctx,
+                                   (CUdeviceptr)src->data[1], src->linesize[1],
+                                   src->width / 2, src->height / 2,
+                                   (CUdeviceptr)out->data[1], out->linesize[1],
+                                   cx / 2, cy / 2, 2);
+            if (ret < 0) {
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                goto fail;
+            }
+        } else {
+            /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords
+             * выровнены до 2 в config_output). */
+            NppStatus npp_err;
+            double xfactor_y  = (double)cw       / src->width;
+            double yfactor_y  = (double)ch       / src->height;
+            double xfactor_uv = (double)(cw / 2) / (src->width  / 2);
+            double yfactor_uv = (double)(ch / 2) / (src->height / 2);
 
-        /* Y plane */
-        ret = copy_input_plane(ctx,
-                               (CUdeviceptr)src->data[0], src->linesize[0],
-                               src->width, src->height,
-                               (CUdeviceptr)out->data[0], out->linesize[0],
-                               cx, cy, 1);
-        if (ret < 0) {
-            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-            goto fail;
-        }
+            uint8_t *dst_y_ptr  = out->data[0] + (size_t)cy       * out->linesize[0] + cx;
+            uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx;
+            /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */
 
-        /* UV plane (NV12: половинное разрешение, 2 bytes per "pixel") */
-        ret = copy_input_plane(ctx,
-                               (CUdeviceptr)src->data[1], src->linesize[1],
-                               src->width / 2, src->height / 2,
-                               (CUdeviceptr)out->data[1], out->linesize[1],
-                               cx / 2, cy / 2, 2);
-        if (ret < 0) {
-            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-            goto fail;
+            /* Y plane — 1 channel (luma) */
+            npp_err = nppiResizeSqrPixel_8u_C1R(
+                src->data[0], (NppiSize){src->width, src->height},
+                src->linesize[0], (NppiRect){0, 0, src->width, src->height},
+                dst_y_ptr,    out->linesize[0],
+                (NppiRect){0, 0, cw, ch},
+                xfactor_y, yfactor_y, 0.0, 0.0,
+                NPPI_INTER_LINEAR);
+            if (npp_err != NPP_SUCCESS) {
+                av_log(ctx, AV_LOG_ERROR,
+                       "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n",
+                       i, src->width, src->height, cw, ch, npp_err);
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                ret = AVERROR_EXTERNAL;
+                goto fail;
+            }
+
+            /* UV plane — 2 channels interleaved (NV12 chroma) */
+            npp_err = nppiResizeSqrPixel_8u_C2R(
+                src->data[1], (NppiSize){src->width / 2, src->height / 2},
+                src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2},
+                dst_uv_ptr,   out->linesize[1],
+                (NppiRect){0, 0, cw / 2, ch / 2},
+                xfactor_uv, yfactor_uv, 0.0, 0.0,
+                NPPI_INTER_LINEAR);
+            if (npp_err != NPP_SUCCESS) {
+                av_log(ctx, AV_LOG_ERROR,
+                       "input %d UV plane NPP resize failed: %d\n", i, npp_err);
+                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+                ret = AVERROR_EXTERNAL;
+                goto fail;
+            }
         }
     }