vf_cuda_grid: Phase 2b — delegated scaling to upstream scale_npp

После попытки in-filter NPP scaling обнаружено, что nppiResize не имеет _C2R variant для NV12 UV interleaved (только C1R, C3R, C4R).

Alternatives:
- 2× nppiResize_8u_C1R с split/merge через intermediate buffers
- custom CUDA kernel
- treat UV pair как 16u (blending artifact на boundaries)

Pragmatic decision: cuda_grid делает только composition (same-size memcpy), а scaling делегируется существующему scale_npp filter в filter chain:

  [0]scale_npp=1280:1080[s0]; [1]scale_npp=640:360[s1]; ... \
  [s0][s1]...cuda_grid=layout=main_plus_preview

Unix philosophy + leverages production-tested NPP code. Controller (Phase 3) auto-generates filter graph с scale_npp per input.

Revert:
- #include <nppi.h>
- libnpp dependency в configure (теперь cuda_grid_filter_deps="ffnvcodec")
- nppiResize* calls в compose path

Add:
- Error message с примером scale_npp chain pattern
- Doc в file header с примером filter graph

Phase 2 = full deliverable (2a + 2b). Дальше Phase 3 controller.
2026-05-19 21:45:40 +01:00
parent 11f310061a
commit 178fc5bb4e
2 changed files with 46 additions and 72 deletions
@@ -3317,7 +3317,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
-cuda_grid_filter_deps="ffnvcodec libnpp"
+cuda_grid_filter_deps="ffnvcodec"
 sharpen_npp_filter_deps="ffnvcodec libnpp"

 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
@@ -4,13 +4,25 @@
 * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells
 * в layout. End-to-end CUDA (без CPU round-trip).
 *
- * Phase 2a: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
+ * Phase 2: layout templates (single/dual_h/dual_v/quad/main_plus_preview/
 * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size
 * через option (default 1920×1080). Cell rects = normalized × output size.
- * **Scaling пока нет** — каждый input должен быть точно cell size (Phase 2b NPP).
+ *
+ * **Scaling delegated to upstream `scale_npp`** filter (Unix philosophy +
+ * production-tested NPP code). NPP не имеет nppiResize_8u_C2R для NV12 UV
+ * interleaved, поэтому in-filter scaling потребовал бы либо двух intermediate
+ * plane buffers, либо custom CUDA kernel — и то и другое сложнее, чем filter chain:
+ *
+ *   ffmpeg ... -filter_complex \
+ *     "[0]scale_npp=1280:1080[s0]; \
+ *      [1]scale_npp=640:360[s1]; \
+ *      [2]scale_npp=640:360[s2]; \
+ *      [3]scale_npp=640:360[s3]; \
+ *      [s0][s1][s2][s3]cuda_grid=layout=main_plus_preview[out]"
+ *
+ * Controller (Phase 3) auto-generates filter graph с scale_npp per input.
 *
 * Future phases (см. gx/vf-cuda-grid#1):
- *  - Phase 2b: per-cell scaling через libnpp (mixed-size inputs)
 *  - Phase 3: runtime layout switching через process_command (ZMQ)
 *  - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat)
 *
@@ -19,8 +31,6 @@

 #include "config_components.h"

-#include <nppi.h>
-
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/cuda_check.h"
@@ -215,9 +225,6 @@ static int cuda_grid_compose(FFFrameSync *fs)
    if (ret < 0)
        goto fail;

-    /* NPP в этом thread'е работает в нашем CUDA stream */
-    nppSetStream(s->cu_stream);
-
    for (i = 0; i < nb; i++) {
        AVFrame *src = in[i];
        int cx = s->cell_px[i].x;
@@ -225,71 +232,38 @@ static int cuda_grid_compose(FFFrameSync *fs)
        int cw = s->cell_px[i].w;
        int ch = s->cell_px[i].h;

-        if (src->width == cw && src->height == ch) {
-            /* Fast path: same size — memcpy без NPP overhead */
-            ret = copy_input_plane(ctx,
-                                   (CUdeviceptr)src->data[0], src->linesize[0],
-                                   src->width, src->height,
-                                   (CUdeviceptr)out->data[0], out->linesize[0],
-                                   cx, cy, 1);
-            if (ret < 0) {
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                goto fail;
-            }
-            ret = copy_input_plane(ctx,
-                                   (CUdeviceptr)src->data[1], src->linesize[1],
-                                   src->width / 2, src->height / 2,
-                                   (CUdeviceptr)out->data[1], out->linesize[1],
-                                   cx / 2, cy / 2, 2);
-            if (ret < 0) {
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                goto fail;
-            }
-        } else {
-            /* Phase 2b: NPP scaling. Output обязательно chroma-aligned (cell coords
-             * выровнены до 2 в config_output). */
-            NppStatus npp_err;
-            double xfactor_y  = (double)cw       / src->width;
-            double yfactor_y  = (double)ch       / src->height;
-            double xfactor_uv = (double)(cw / 2) / (src->width  / 2);
-            double yfactor_uv = (double)(ch / 2) / (src->height / 2);
+        if (src->width != cw || src->height != ch) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "input %d size %dx%d != cell size %dx%d. "
+                   "cuda_grid не делает scaling — используй upstream scale_npp:\n"
+                   "  [in%d]scale_npp=%d:%d[scaled%d]; [scaled%d]...cuda_grid=layout=%s\n",
+                   i, src->width, src->height, cw, ch,
+                   i, cw, ch, i, i, s->layout->name);
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }

-            uint8_t *dst_y_ptr  = out->data[0] + (size_t)cy       * out->linesize[0] + cx;
-            uint8_t *dst_uv_ptr = out->data[1] + (size_t)(cy / 2) * out->linesize[1] + cx;
-            /* dst_uv X в bytes = cx (2 bytes per UV-pair × cx/2 = cx bytes) */
+        /* Y plane (1 byte per pixel) */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[0], src->linesize[0],
+                               src->width, src->height,
+                               (CUdeviceptr)out->data[0], out->linesize[0],
+                               cx, cy, 1);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
+        }

-            /* Y plane — 1 channel (luma) */
-            npp_err = nppiResizeSqrPixel_8u_C1R(
-                src->data[0], (NppiSize){src->width, src->height},
-                src->linesize[0], (NppiRect){0, 0, src->width, src->height},
-                dst_y_ptr,    out->linesize[0],
-                (NppiRect){0, 0, cw, ch},
-                xfactor_y, yfactor_y, 0.0, 0.0,
-                NPPI_INTER_LINEAR);
-            if (npp_err != NPP_SUCCESS) {
-                av_log(ctx, AV_LOG_ERROR,
-                       "input %d Y plane NPP resize %dx%d→%dx%d failed: %d\n",
-                       i, src->width, src->height, cw, ch, npp_err);
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                ret = AVERROR_EXTERNAL;
-                goto fail;
-            }
-
-            /* UV plane — 2 channels interleaved (NV12 chroma) */
-            npp_err = nppiResizeSqrPixel_8u_C2R(
-                src->data[1], (NppiSize){src->width / 2, src->height / 2},
-                src->linesize[1], (NppiRect){0, 0, src->width / 2, src->height / 2},
-                dst_uv_ptr,   out->linesize[1],
-                (NppiRect){0, 0, cw / 2, ch / 2},
-                xfactor_uv, yfactor_uv, 0.0, 0.0,
-                NPPI_INTER_LINEAR);
-            if (npp_err != NPP_SUCCESS) {
-                av_log(ctx, AV_LOG_ERROR,
-                       "input %d UV plane NPP resize failed: %d\n", i, npp_err);
-                CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
-                ret = AVERROR_EXTERNAL;
-                goto fail;
-            }
+        /* UV plane (NV12: half resolution, 2 bytes per "pixel") */
+        ret = copy_input_plane(ctx,
+                               (CUdeviceptr)src->data[1], src->linesize[1],
+                               src->width / 2, src->height / 2,
+                               (CUdeviceptr)out->data[1], out->linesize[1],
+                               cx / 2, cy / 2, 2);
+        if (ret < 0) {
+            CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+            goto fail;
        }
    }