diff --git a/configure b/configure index c24aa94..b94029e 100755 --- a/configure +++ b/configure @@ -3317,6 +3317,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +cuda_grid_filter_deps="ffnvcodec" +cuda_grid_filter_deps_any="cuda_nvcc cuda_llvm" sharpen_npp_filter_deps="ffnvcodec libnpp" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 91487af..f570821 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -410,6 +410,8 @@ OBJS-$(CONFIG_OSCILLOSCOPE_FILTER) += vf_datascope.o OBJS-$(CONFIG_OVERLAY_FILTER) += vf_overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_CUDA_FILTER) += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \ cuda/load_helper.o +OBJS-$(CONFIG_CUDA_GRID_FILTER) += vf_cuda_grid.o framesync.o \ + vf_cuda_grid.ptx.o cuda/load_helper.o OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \ opencl/overlay.o framesync.o OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 9819f0f..bc0d00a 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -390,6 +390,7 @@ extern const AVFilter ff_vf_overlay_qsv; extern const AVFilter ff_vf_overlay_vaapi; extern const AVFilter ff_vf_overlay_vulkan; extern const AVFilter ff_vf_overlay_cuda; +extern const AVFilter ff_vf_cuda_grid; extern const AVFilter ff_vf_owdenoise; extern const AVFilter ff_vf_pad; extern const AVFilter ff_vf_pad_opencl; diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c new file mode 100644 index 0000000..6367239 --- /dev/null +++ b/libavfilter/vf_cuda_grid.c @@ -0,0 +1,1679 @@ +/* + * cuda_grid — GPU-native video grid composer для FFmpeg 7.x. + * + * Принимает N CUDA-frames на входе, выдаёт один composed frame с N cells + * в layout. End-to-end CUDA (без CPU round-trip). + * + * Phase 2: layout templates (single/dual_h/dual_v/quad/main_plus_preview/ + * six_grid/nine_grid/sixteen_grid/panoramic), dynamic nb_inputs, output size + * через option (default 1920×1080). Cell rects = normalized × output size. + * + * **Scaling delegated to upstream `scale_npp`** filter (Unix philosophy + + * production-tested NPP code). NPP не имеет nppiResize_8u_C2R для NV12 UV + * interleaved, поэтому in-filter scaling = either two intermediate plane + * buffers либо custom CUDA kernel — оба больше work чем filter chain'ить: + * + * ffmpeg ... -filter_complex \ + * "[0]scale_npp=1280:1080[s0]; \ + * [1]scale_npp=640:360[s1]; \ + * [2]scale_npp=640:360[s2]; \ + * [3]scale_npp=640:360[s3]; \ + * [s0][s1][s2][s3]cuda_grid=layout=main_plus_preview[out]" + * + * Controller (Phase 3) auto-generates filter graph с scale_npp per input. + * + * Future phases (см. gx/vf-cuda-grid#1): + * - Phase 3: runtime layout switching через process_command (ZMQ) + * - Phase 4+: overlay primitives (rect/text/icon/image/dim/graph/chat) + * + * Лицензия: LGPL-2.1+ (соответствует FFmpeg) + */ + +#include "config_components.h" + +#include + +#include "libavutil/avstring.h" +#include "libavutil/common.h" +#include "libavutil/cuda_check.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/log.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/thread.h" + +#include "config.h" +#if CONFIG_LIBFREETYPE +#include +#include FT_FREETYPE_H +#endif + +#include "libavcodec/avcodec.h" +#include "libavformat/avformat.h" +#include "libswscale/swscale.h" + +#include "avfilter.h" +#include "cuda/load_helper.h" +#include "filters.h" +#include "formats.h" +#include "framesync.h" +#include "video.h" + +extern const unsigned char ff_vf_cuda_grid_ptx_data[]; +extern const unsigned int ff_vf_cuda_grid_ptx_len; + +#define BLOCKX 16 +#define BLOCKY 16 +#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) +#define MAX_CELLS 16 +#define MAX_OVERLAYS 64 +#define OVERLAY_ID_MAX 32 +#define OVERLAY_TEXT_MAX 128 + +/* ─── Overlay primitives (Phase 4b-1: rect only, no alpha) ─────────────── */ + +typedef enum { + OV_TYPE_RECT = 0, + OV_TYPE_TEXT, /* Phase 4b-3 */ + OV_TYPE_ICON, /* Phase 4b-4 */ + OV_TYPE_DIM, /* Phase 4b-2 (alpha kernel) */ +} GridOverlayType; + +typedef struct GridOverlay { + char id[OVERLAY_ID_MAX]; + GridOverlayType type; + int cell; /* -1 = absolute on output, иначе cell-relative */ + float x, y, w, h; /* [0..1] normalized; relative к cell или output */ + int z_order; /* меньше → рисуется первым */ + uint8_t opacity; /* 0..255, реально применяется в Phase 4b-2 */ + int visible; + union { + struct { + uint8_t r, g, b; + int thickness; /* 0 = filled, >0 = border (border via 4 strips) */ + } rect; + struct { + char text[OVERLAY_TEXT_MAX]; + int font_size; + uint8_t r, g, b; + } text; + struct { + char icon_name[32]; + } icon; + struct { + uint8_t amount; /* 0..255, alpha for dim */ + } dim; + } u; +} GridOverlay; + +/* ─── Layout templates (normalized координаты 0.0–1.0) ─────────────────── */ + +typedef struct LayoutCell { + float x, y, w, h; /* normalized fraction of output size */ +} LayoutCell; + +typedef struct LayoutTemplate { + const char *name; + int nb_cells; + LayoutCell cells[MAX_CELLS]; +} LayoutTemplate; + +/* Layouts copy-paste'нуты по структуре cctv-processor config/grids.json. */ +static const LayoutTemplate layouts[] = { + { + "single", 1, + { {0.0f, 0.0f, 1.0f, 1.0f} } + }, + { + "dual_horizontal", 2, + { {0.0f, 0.0f, 0.5f, 1.0f}, {0.5f, 0.0f, 0.5f, 1.0f} } + }, + { + "dual_vertical", 2, + { {0.0f, 0.0f, 1.0f, 0.5f}, {0.0f, 0.5f, 1.0f, 0.5f} } + }, + { + "quad", 4, + { + {0.0f, 0.0f, 0.5f, 0.5f}, {0.5f, 0.0f, 0.5f, 0.5f}, + {0.0f, 0.5f, 0.5f, 0.5f}, {0.5f, 0.5f, 0.5f, 0.5f}, + } + }, + { + /* Main camera 2/3 width, 3 small cameras stacked справа сверху вниз */ + "main_plus_preview", 4, + { + {0.0f, 0.0f, 2.0f/3, 1.0f}, + {2.0f/3, 0.0f, 1.0f/3, 1.0f/3}, + {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, + {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, + } + }, + { + "six_grid", 6, + { + {0.0f, 0.0f, 1.0f/3, 0.5f}, {1.0f/3, 0.0f, 1.0f/3, 0.5f}, {2.0f/3, 0.0f, 1.0f/3, 0.5f}, + {0.0f, 0.5f, 1.0f/3, 0.5f}, {1.0f/3, 0.5f, 1.0f/3, 0.5f}, {2.0f/3, 0.5f, 1.0f/3, 0.5f}, + } + }, + { + "nine_grid", 9, + { + {0.0f, 0.0f, 1.0f/3, 1.0f/3}, {1.0f/3, 0.0f, 1.0f/3, 1.0f/3}, {2.0f/3, 0.0f, 1.0f/3, 1.0f/3}, + {0.0f, 1.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 1.0f/3, 1.0f/3, 1.0f/3}, + {0.0f, 2.0f/3, 1.0f/3, 1.0f/3}, {1.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, {2.0f/3, 2.0f/3, 1.0f/3, 1.0f/3}, + } + }, + { + "sixteen_grid", 16, + { + {0.00f, 0.00f, 0.25f, 0.25f}, {0.25f, 0.00f, 0.25f, 0.25f}, {0.50f, 0.00f, 0.25f, 0.25f}, {0.75f, 0.00f, 0.25f, 0.25f}, + {0.00f, 0.25f, 0.25f, 0.25f}, {0.25f, 0.25f, 0.25f, 0.25f}, {0.50f, 0.25f, 0.25f, 0.25f}, {0.75f, 0.25f, 0.25f, 0.25f}, + {0.00f, 0.50f, 0.25f, 0.25f}, {0.25f, 0.50f, 0.25f, 0.25f}, {0.50f, 0.50f, 0.25f, 0.25f}, {0.75f, 0.50f, 0.25f, 0.25f}, + {0.00f, 0.75f, 0.25f, 0.25f}, {0.25f, 0.75f, 0.25f, 0.25f}, {0.50f, 0.75f, 0.25f, 0.25f}, {0.75f, 0.75f, 0.25f, 0.25f}, + } + }, + { + /* Один widescreen panoramic — 1 cell full width */ + "panoramic", 1, + { {0.0f, 0.0f, 1.0f, 1.0f} } + }, +}; + +static const LayoutTemplate *find_layout(const char *name) +{ + for (size_t i = 0; i < FF_ARRAY_ELEMS(layouts); i++) { + if (!strcmp(layouts[i].name, name)) + return &layouts[i]; + } + return NULL; +} + +/* ─── Filter state ─────────────────────────────────────────────────────── */ + +/* Icon atlas — RGBA buffer на GPU, keyed by icon_name (Phase 4b-4, shared across overlays). */ +#define OVERLAY_ICON_NAME_MAX 32 +#define MAX_ICON_ATLASES 32 + +typedef struct IconAtlas { + char icon_name[OVERLAY_ICON_NAME_MAX]; + CUdeviceptr device_ptr; + size_t device_pitch; + int w, h; +} IconAtlas; + +/* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */ +typedef struct TextAtlas { + char id[OVERLAY_ID_MAX]; + char cached_text[OVERLAY_TEXT_MAX]; + int cached_font_size; + uint8_t cached_r, cached_g, cached_b; + CUdeviceptr device_ptr; + size_t device_pitch; + int w, h; +} TextAtlas; + +typedef struct CudaGridContext { + const AVClass *class; + + /* Options */ + char *layout_name; + int out_width; + int out_height; + char *font_file; + int default_font_size; + char *icon_dir; + + /* Resolved layout (после init) */ + const LayoutTemplate *layout; + + /* CUDA */ + AVCUDADeviceContext *hwctx; + CUcontext cu_ctx; + CUstream cu_stream; + + FFFrameSync fs; + + /* Per-cell pixel rects (computed в config_output из normalized × out size) */ + struct { + int x, y, w, h; + } cell_px[MAX_CELLS]; + + /* Overlay state (Phase 4b) — mutex-guarded, process_command/render thread-safe */ + GridOverlay overlays[MAX_OVERLAYS]; + int nb_overlays; + pthread_mutex_t overlay_lock; + int overlay_lock_inited; + + /* CUDA kernels (Phase 4b-2) — loaded from embedded .ptx */ + CUmodule cu_module; + CUfunction cu_func_alpha_fill_y; + CUfunction cu_func_alpha_fill_uv; + CUfunction cu_func_alpha_blit_rgba_y; + CUfunction cu_func_alpha_blit_rgba_uv; + + /* Text rendering (Phase 4b-3, libfreetype optional) */ +#if CONFIG_LIBFREETYPE + FT_Library ft_lib; + FT_Face ft_face; + int ft_ready; /* 1 once library + face loaded */ + int ft_init_attempted; /* set on first attempt, success or fail */ +#endif + TextAtlas atlases[MAX_OVERLAYS]; + int nb_atlases; + + /* Icon atlas cache (Phase 4b-4) */ + IconAtlas icon_atlases[MAX_ICON_ATLASES]; + int nb_icon_atlases; +} CudaGridContext; + +/* ─── Composition: copy одного input plane в target region output ──────── */ + +static int copy_input_plane(AVFilterContext *ctx, + CUdeviceptr src_data, + int src_pitch, + int src_w, + int src_h, + CUdeviceptr dst_data, + int dst_pitch, + int dst_x, + int dst_y, + int bytes_per_pixel) +{ + CudaGridContext *s = ctx->priv; + CUDA_MEMCPY2D cpy = { + .srcMemoryType = CU_MEMORYTYPE_DEVICE, + .srcDevice = src_data, + .srcPitch = src_pitch, + .dstMemoryType = CU_MEMORYTYPE_DEVICE, + .dstDevice = dst_data, + .dstXInBytes = (size_t)dst_x * bytes_per_pixel, + .dstY = dst_y, + .dstPitch = dst_pitch, + .WidthInBytes = (size_t)src_w * bytes_per_pixel, + .Height = src_h, + }; + return CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2DAsync(&cpy, s->cu_stream)); +} + +/* ─── Overlay parsing + rendering (Phase 4b-1: rect, solid fill, no alpha) ─ */ + +/* BT.709 limited-range RGB → YUV. Достаточно для HDTV (любое 1920×1080 output). */ +static av_always_inline void rgb_to_yuv709(uint8_t r, uint8_t g, uint8_t b, + uint8_t *Y, uint8_t *U, uint8_t *V) +{ + int y = (int)( 0.183f * r + 0.614f * g + 0.062f * b) + 16; + int u = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128; + int v = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128; + *Y = av_clip_uint8(y); + *U = av_clip_uint8(u); + *V = av_clip_uint8(v); +} + +/* In-place URL-decode (%20 etc.) — for string values приходящих от controller. */ +static void url_decode_inplace(char *s) +{ + char *r = s, *w = s; + while (*r) { + if (r[0] == '%' && r[1] && r[2]) { + int hi = -1, lo = -1; + char c1 = r[1], c2 = r[2]; + if (c1 >= '0' && c1 <= '9') hi = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') hi = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') hi = c1 - 'a' + 10; + if (c2 >= '0' && c2 <= '9') lo = c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') lo = c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') lo = c2 - 'a' + 10; + if (hi >= 0 && lo >= 0) { + *w++ = (char)((hi << 4) | lo); + r += 3; + continue; + } + } + *w++ = *r++; + } + *w = '\0'; +} + +/* Parse args в формате: " = = ..." + * String values приходят URL-encoded (%20 = space) от controller. + * Out *ov заполняется defaults + parsed values. Returns 0 / AVERROR. */ +static int parse_overlay_args(AVFilterContext *ctx, const char *args, GridOverlay *ov) +{ + char id[OVERLAY_ID_MAX], type_str[16]; + const char *p; + + if (!args) + return AVERROR(EINVAL); + + if (sscanf(args, "%31s %15s", id, type_str) != 2) { + av_log(ctx, AV_LOG_ERROR, "overlay args: expected ' ...', got: %s\n", args); + return AVERROR(EINVAL); + } + + memset(ov, 0, sizeof(*ov)); + av_strlcpy(ov->id, id, sizeof(ov->id)); + + if (!strcmp(type_str, "rect")) ov->type = OV_TYPE_RECT; + else if (!strcmp(type_str, "text")) ov->type = OV_TYPE_TEXT; + else if (!strcmp(type_str, "icon")) ov->type = OV_TYPE_ICON; + else if (!strcmp(type_str, "dim")) ov->type = OV_TYPE_DIM; + else { + av_log(ctx, AV_LOG_ERROR, "unknown overlay type: %s\n", type_str); + return AVERROR(EINVAL); + } + + /* Defaults */ + ov->cell = -1; + ov->opacity = 255; + ov->visible = 1; + if (ov->type == OV_TYPE_RECT) + ov->u.rect.thickness = 0; /* filled */ + if (ov->type == OV_TYPE_DIM) + ov->u.dim.amount = 128; + + /* Advance past и */ + p = strchr(args, ' '); + if (!p) return 0; + while (*p == ' ') p++; + p = strchr(p, ' '); + if (!p) return 0; + while (*p == ' ') p++; + + while (*p) { + char key[32], val[OVERLAY_TEXT_MAX]; + int n = 0; + if (sscanf(p, "%31[^= \t]=%127s%n", key, val, &n) < 2) + break; + p += n; + while (*p == ' ' || *p == '\t') p++; + + if (!strcmp(key, "cell")) ov->cell = atoi(val); + else if (!strcmp(key, "x")) ov->x = (float)atof(val); + else if (!strcmp(key, "y")) ov->y = (float)atof(val); + else if (!strcmp(key, "w")) ov->w = (float)atof(val); + else if (!strcmp(key, "h")) ov->h = (float)atof(val); + else if (!strcmp(key, "z_order") || !strcmp(key, "z")) + ov->z_order = atoi(val); + else if (!strcmp(key, "opacity")) ov->opacity = av_clip(atoi(val), 0, 255); + else if (!strcmp(key, "visible")) ov->visible = atoi(val) ? 1 : 0; + else if (ov->type == OV_TYPE_RECT) { + if (!strcmp(key, "r")) ov->u.rect.r = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "g")) ov->u.rect.g = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "b")) ov->u.rect.b = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "thickness")) ov->u.rect.thickness = atoi(val); + } else if (ov->type == OV_TYPE_TEXT) { + if (!strcmp(key, "text")) { + av_strlcpy(ov->u.text.text, val, sizeof(ov->u.text.text)); + url_decode_inplace(ov->u.text.text); + } + else if (!strcmp(key, "font_size")) ov->u.text.font_size = atoi(val); + else if (!strcmp(key, "r")) ov->u.text.r = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "g")) ov->u.text.g = av_clip_uint8(atoi(val)); + else if (!strcmp(key, "b")) ov->u.text.b = av_clip_uint8(atoi(val)); + } else if (ov->type == OV_TYPE_ICON) { + if (!strcmp(key, "icon_name")) { + av_strlcpy(ov->u.icon.icon_name, val, sizeof(ov->u.icon.icon_name)); + url_decode_inplace(ov->u.icon.icon_name); + } + } else if (ov->type == OV_TYPE_DIM) { + if (!strcmp(key, "amount")) ov->u.dim.amount = av_clip_uint8(atoi(val)); + } + } + return 0; +} + +/* Lock must be held by caller. */ +static int overlay_upsert_locked(CudaGridContext *s, const GridOverlay *ov) +{ + int i; + for (i = 0; i < s->nb_overlays; i++) { + if (!strcmp(s->overlays[i].id, ov->id)) { + s->overlays[i] = *ov; + return 0; + } + } + if (s->nb_overlays >= MAX_OVERLAYS) + return AVERROR(ENOSPC); + s->overlays[s->nb_overlays++] = *ov; + return 0; +} + +static int overlay_remove_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_overlays; i++) { + if (!strcmp(s->overlays[i].id, id)) { + memmove(&s->overlays[i], &s->overlays[i + 1], + (s->nb_overlays - i - 1) * sizeof(GridOverlay)); + s->nb_overlays--; + return 0; + } + } + return AVERROR(ENOENT); +} + +/* Compute pixel rect для overlay: либо cell-relative, либо absolute. + * Clips против output bounds + 2px alignment для NV12 chroma. */ +static int overlay_pixel_rect(CudaGridContext *s, const GridOverlay *ov, + int *out_x, int *out_y, int *out_w, int *out_h) +{ + int rx, ry, rw, rh; + int base_x, base_y, base_w, base_h; + + if (ov->cell < 0) { + base_x = 0; + base_y = 0; + base_w = s->out_width; + base_h = s->out_height; + } else if (ov->cell < s->layout->nb_cells) { + base_x = s->cell_px[ov->cell].x; + base_y = s->cell_px[ov->cell].y; + base_w = s->cell_px[ov->cell].w; + base_h = s->cell_px[ov->cell].h; + } else { + /* Cell index out of range — silently skip (overlay broadcast'нет ко всем + * cuda_grid instances, layouts с меньшим nb_cells). NOT error чтобы + * не валить filter chain. */ + return 1; + } + + rx = base_x + (int)(ov->x * base_w); + ry = base_y + (int)(ov->y * base_h); + rw = (int)(ov->w * base_w); + rh = (int)(ov->h * base_h); + + if (rx < 0) { rw += rx; rx = 0; } + if (ry < 0) { rh += ry; ry = 0; } + if (rx + rw > s->out_width) rw = s->out_width - rx; + if (ry + rh > s->out_height) rh = s->out_height - ry; + rx &= ~1; ry &= ~1; rw &= ~1; rh &= ~1; + + *out_x = rx; *out_y = ry; *out_w = rw; *out_h = rh; + return (rw > 0 && rh > 0) ? 0 : 1; /* 1 → empty, skip */ +} + +/* Strip α-blend (opacity 0..255). cu ctx must be pushed. + * opacity=255 → solid fill (kernel uses single-pass blend formula). */ +static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out, + int x, int y, int w, int h, + uint8_t Y, uint8_t U, uint8_t V, uint8_t alpha) +{ + CudaGridContext *s = ctx->priv; + CUdeviceptr dst_y, dst_uv; + int ret; + int iY = Y, iU = U, iV = V, iA = alpha; + int rx, ry, rw, rh; + + x &= ~1; y &= ~1; w &= ~1; h &= ~1; + if (w <= 0 || h <= 0 || alpha == 0) return 0; + + /* Y plane */ + dst_y = (CUdeviceptr)out->data[0]; + rx = x; ry = y; rw = w; rh = h; + { + int dst_pitch_y = out->linesize[0]; + void *args[] = { &dst_y, &dst_pitch_y, &rx, &ry, &rw, &rh, &iY, &iA }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_fill_y, + DIV_UP(w, BLOCKX), DIV_UP(h, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + + /* UV plane (half res, interleaved 2 bytes per chroma sample) */ + dst_uv = (CUdeviceptr)out->data[1]; + rx = x / 2; ry = y / 2; rw = w / 2; rh = h / 2; + { + int dst_pitch_uv = out->linesize[1]; + void *args[] = { &dst_uv, &dst_pitch_uv, &rx, &ry, &rw, &rh, &iU, &iV, &iA }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_fill_uv, + DIV_UP(w / 2, BLOCKX), DIV_UP(h / 2, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + return 0; +} + +/* ─── Text rendering (Phase 4b-3, freetype) ────────────────────────────── */ + +#if CONFIG_LIBFREETYPE + +static const char *const default_fonts[] = { + "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", + "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", + "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf", + "/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf", + NULL, +}; + +static int ensure_ft_loaded(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + const char *path = s->font_file; + FT_Error err; + int i; + + if (s->ft_ready) return 0; + if (s->ft_init_attempted) return AVERROR(ENOSYS); + s->ft_init_attempted = 1; + + err = FT_Init_FreeType(&s->ft_lib); + if (err) { + av_log(ctx, AV_LOG_WARNING, "FT_Init_FreeType failed (err=%d), text overlays disabled\n", err); + return AVERROR(ENOSYS); + } + + if (!path) { + for (i = 0; default_fonts[i]; i++) { + if (access(default_fonts[i], R_OK) == 0) { + path = default_fonts[i]; + break; + } + } + } + if (!path) { + av_log(ctx, AV_LOG_WARNING, "no font found (set font_file= option), text disabled\n"); + FT_Done_FreeType(s->ft_lib); + s->ft_lib = NULL; + return AVERROR(ENOENT); + } + + err = FT_New_Face(s->ft_lib, path, 0, &s->ft_face); + if (err) { + av_log(ctx, AV_LOG_WARNING, "FT_New_Face(%s) failed: %d, text disabled\n", path, err); + FT_Done_FreeType(s->ft_lib); + s->ft_lib = NULL; + return AVERROR(EIO); + } + + av_log(ctx, AV_LOG_INFO, "freetype: loaded %s\n", path); + s->ft_ready = 1; + return 0; +} + +/* Two-pass rasterize: 1) measure total_w / ascent / descent, 2) blit to RGBA buffer. + * Returns malloc'd RGBA buffer (caller frees) + dims via out params, or NULL. */ +static uint8_t *rasterize_text_rgba(AVFilterContext *ctx, const char *text, int font_size, + uint8_t r, uint8_t g, uint8_t b, + int *out_w, int *out_h) +{ + CudaGridContext *s = ctx->priv; + FT_Error err; + const unsigned char *p; + int total_w = 0, ascent_px = 0, descent_px = 0; + int atlas_w, atlas_h, pen_x, baseline_y; + uint8_t *atlas; + + if (!s->ft_ready) return NULL; + err = FT_Set_Pixel_Sizes(s->ft_face, 0, font_size); + if (err) { + av_log(ctx, AV_LOG_WARNING, "FT_Set_Pixel_Sizes failed: %d\n", err); + return NULL; + } + + /* Pass 1: measure */ + for (p = (const unsigned char *)text; *p; p++) { + FT_UInt gi; + FT_GlyphSlot g_slot; + gi = FT_Get_Char_Index(s->ft_face, *p); + if (!gi) continue; + if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_DEFAULT)) continue; + g_slot = s->ft_face->glyph; + total_w += (g_slot->advance.x >> 6); + ascent_px = FFMAX(ascent_px, g_slot->bitmap_top); + descent_px = FFMAX(descent_px, (int)g_slot->bitmap.rows - g_slot->bitmap_top); + } + if (total_w <= 0) return NULL; + + atlas_w = total_w + 4; /* padding */ + atlas_h = ascent_px + descent_px + 4; + if (atlas_h <= 0) atlas_h = font_size; + + atlas = av_mallocz((size_t)atlas_w * atlas_h * 4); + if (!atlas) return NULL; + + /* Pass 2: blit */ + pen_x = 2; + baseline_y = ascent_px + 2; + for (p = (const unsigned char *)text; *p; p++) { + FT_UInt gi; + FT_GlyphSlot g_slot; + int gx, gy, bw, bh; + int yy, xx; + + gi = FT_Get_Char_Index(s->ft_face, *p); + if (!gi) continue; + if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_RENDER)) continue; + g_slot = s->ft_face->glyph; + if (g_slot->bitmap.pixel_mode != FT_PIXEL_MODE_GRAY) { + pen_x += g_slot->advance.x >> 6; + continue; + } + bw = g_slot->bitmap.width; + bh = g_slot->bitmap.rows; + gx = pen_x + g_slot->bitmap_left; + gy = baseline_y - g_slot->bitmap_top; + + for (yy = 0; yy < bh; yy++) { + int dy = gy + yy; + if (dy < 0 || dy >= atlas_h) continue; + for (xx = 0; xx < bw; xx++) { + int dx = gx + xx; + uint8_t gray; + uint8_t *dst; + if (dx < 0 || dx >= atlas_w) continue; + gray = g_slot->bitmap.buffer[yy * g_slot->bitmap.pitch + xx]; + if (gray == 0) continue; + dst = atlas + ((size_t)dy * atlas_w + dx) * 4; + /* Premultiplied: max() — text "fills" rather than blending overlaps */ + if (gray > dst[3]) { + dst[0] = r; + dst[1] = g; + dst[2] = b; + dst[3] = gray; + } + } + } + pen_x += g_slot->advance.x >> 6; + } + + *out_w = atlas_w; + *out_h = atlas_h; + return atlas; +} + +#endif /* CONFIG_LIBFREETYPE */ + +/* Cache lookup — locked by caller. */ +static TextAtlas *atlas_find_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_atlases; i++) + if (!strcmp(s->atlases[i].id, id)) + return &s->atlases[i]; + return NULL; +} + +static void atlas_free_gpu_locked(CudaGridContext *s, TextAtlas *a) +{ + if (a->device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(a->device_ptr); + a->device_ptr = 0; + } +} + +static void atlas_remove_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_atlases; i++) { + if (!strcmp(s->atlases[i].id, id)) { + atlas_free_gpu_locked(s, &s->atlases[i]); + memmove(&s->atlases[i], &s->atlases[i + 1], + (s->nb_atlases - i - 1) * sizeof(TextAtlas)); + s->nb_atlases--; + return; + } + } +} + +#if CONFIG_LIBFREETYPE +/* Ensure GPU atlas for given overlay matches its text/font_size/color. cu ctx pushed by caller. */ +static int ensure_text_atlas(AVFilterContext *ctx, const GridOverlay *ov, TextAtlas **out_atlas) +{ + CudaGridContext *s = ctx->priv; + TextAtlas *a; + uint8_t *cpu_atlas; + int aw, ah; + int font_size; + CUdeviceptr dev_ptr = 0; + size_t dev_pitch = 0; + CUDA_MEMCPY2D cpy = { 0 }; + int ret; + + font_size = ov->u.text.font_size > 0 ? ov->u.text.font_size : s->default_font_size; + + a = atlas_find_locked(s, ov->id); + if (a && + !strcmp(a->cached_text, ov->u.text.text) && + a->cached_font_size == font_size && + a->cached_r == ov->u.text.r && + a->cached_g == ov->u.text.g && + a->cached_b == ov->u.text.b) { + *out_atlas = a; + return 0; + } + + if (s->nb_atlases >= MAX_OVERLAYS && !a) + return AVERROR(ENOSPC); + + ret = ensure_ft_loaded(ctx); + if (ret < 0) return ret; + + cpu_atlas = rasterize_text_rgba(ctx, ov->u.text.text, font_size, + ov->u.text.r, ov->u.text.g, ov->u.text.b, + &aw, &ah); + if (!cpu_atlas) return AVERROR(EINVAL); + + /* Allocate pitched device buffer + upload */ + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch( + &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4)); + if (ret < 0) { av_free(cpu_atlas); return ret; } + + cpy.srcMemoryType = CU_MEMORYTYPE_HOST; + cpy.srcHost = cpu_atlas; + cpy.srcPitch = (size_t)aw * 4; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = dev_ptr; + cpy.dstPitch = dev_pitch; + cpy.WidthInBytes = (size_t)aw * 4; + cpy.Height = (size_t)ah; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy)); + av_free(cpu_atlas); + if (ret < 0) { + s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr); + return ret; + } + + if (a) { + atlas_free_gpu_locked(s, a); + } else { + a = &s->atlases[s->nb_atlases++]; + memset(a, 0, sizeof(*a)); + av_strlcpy(a->id, ov->id, sizeof(a->id)); + } + av_strlcpy(a->cached_text, ov->u.text.text, sizeof(a->cached_text)); + a->cached_font_size = font_size; + a->cached_r = ov->u.text.r; + a->cached_g = ov->u.text.g; + a->cached_b = ov->u.text.b; + a->device_ptr = dev_ptr; + a->device_pitch = dev_pitch; + a->w = aw; + a->h = ah; + *out_atlas = a; + return 0; +} + +static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + TextAtlas *a = NULL; + int px, py; + int base_x, base_y, base_w, base_h; + CUdeviceptr dst_y, dst_uv; + int ret; + int aw, ah, ap, ealpha; + + if (!ov->u.text.text[0]) return 0; + ret = ensure_text_atlas(ctx, ov, &a); + if (ret < 0) { + av_log(ctx, AV_LOG_WARNING, "text overlay %s: atlas failed (ret=%d)\n", ov->id, ret); + return 0; /* non-fatal — skip */ + } + if (!a) return 0; + + /* Position: x,y normalized within cell or output (top-left corner of atlas). */ + if (ov->cell < 0) { + base_x = 0; base_y = 0; + base_w = s->out_width; base_h = s->out_height; + } else if (ov->cell < s->layout->nb_cells) { + base_x = s->cell_px[ov->cell].x; + base_y = s->cell_px[ov->cell].y; + base_w = s->cell_px[ov->cell].w; + base_h = s->cell_px[ov->cell].h; + } else { + return 0; + } + px = base_x + (int)(ov->x * base_w); + py = base_y + (int)(ov->y * base_h); + px &= ~1; py &= ~1; + /* Clip — if atlas runs past output bounds, kernels still safe (they bounds-check + * against atlas w/h, not dst). But avoid going off output: simple skip when fully outside. */ + if (px >= s->out_width || py >= s->out_height) return 0; + + aw = a->w; ah = a->h; + /* Reduce ah if it overflows output (avoid OOB on Y plane); UV plane uses h/2 so same scale. */ + if (px + aw > s->out_width) aw = s->out_width - px; + if (py + ah > s->out_height) ah = s->out_height - py; + aw &= ~1; ah &= ~1; + if (aw <= 0 || ah <= 0) return 0; + + ap = (int)a->device_pitch; + ealpha = ov->opacity; + + /* Y plane */ + dst_y = (CUdeviceptr)out->data[0]; + { + int dst_pitch_y = out->linesize[0]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_y, + DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + /* UV plane (kernel does internal half-res subsampling) */ + dst_uv = (CUdeviceptr)out->data[1]; + { + int dst_pitch_uv = out->linesize[1]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_uv, + DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + return 0; +} + +#endif /* CONFIG_LIBFREETYPE */ + +/* ─── Icon rendering (Phase 4b-4, PNG via libavcodec + swscale) ────────── */ + +static const char *const default_icon_dirs[] = { + "/var/lib/cuda-grid/icons", + "/opt/cuda-grid/icons", + NULL, +}; + +/* Read image file, decode (PNG/JPEG/etc.), convert to packed RGBA8. + * Returns malloc'd buffer (caller frees) + dims, or NULL on error. */ +static uint8_t *load_image_to_rgba_cpu(AVFilterContext *ctx, const char *path, + int *out_w, int *out_h) +{ + AVFormatContext *fmt_ctx = NULL; + AVCodecContext *cctx = NULL; + AVFrame *frame = NULL; + AVPacket *pkt = NULL; + struct SwsContext *sws = NULL; + const AVCodec *codec; + int stream_idx; + uint8_t *rgba = NULL; + int rgba_pitch; + int ret; + + if (avformat_open_input(&fmt_ctx, path, NULL, NULL) < 0) { + av_log(ctx, AV_LOG_WARNING, "icon: open %s failed\n", path); + return NULL; + } + if (avformat_find_stream_info(fmt_ctx, NULL) < 0) goto fail; + + stream_idx = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0); + if (stream_idx < 0) goto fail; + + codec = avcodec_find_decoder(fmt_ctx->streams[stream_idx]->codecpar->codec_id); + if (!codec) goto fail; + + cctx = avcodec_alloc_context3(codec); + if (!cctx) goto fail; + if (avcodec_parameters_to_context(cctx, fmt_ctx->streams[stream_idx]->codecpar) < 0) goto fail; + if (avcodec_open2(cctx, codec, NULL) < 0) goto fail; + + pkt = av_packet_alloc(); + frame = av_frame_alloc(); + if (!pkt || !frame) goto fail; + + if (av_read_frame(fmt_ctx, pkt) < 0) goto fail; + if (avcodec_send_packet(cctx, pkt) < 0) goto fail; + ret = avcodec_receive_frame(cctx, frame); + if (ret < 0) goto fail; + + sws = sws_getContext(frame->width, frame->height, frame->format, + frame->width, frame->height, AV_PIX_FMT_RGBA, + SWS_BILINEAR, NULL, NULL, NULL); + if (!sws) goto fail; + + rgba_pitch = frame->width * 4; + rgba = av_malloc((size_t)rgba_pitch * frame->height); + if (!rgba) goto fail; + + { + uint8_t *out_data[4] = { rgba, NULL, NULL, NULL }; + int out_pitch[4] = { rgba_pitch, 0, 0, 0 }; + sws_scale(sws, (const uint8_t * const *)frame->data, frame->linesize, + 0, frame->height, out_data, out_pitch); + } + + *out_w = frame->width; + *out_h = frame->height; + +fail: + if (sws) sws_freeContext(sws); + if (frame) av_frame_free(&frame); + if (pkt) av_packet_free(&pkt); + if (cctx) avcodec_free_context(&cctx); + if (fmt_ctx) avformat_close_input(&fmt_ctx); + return rgba; +} + +/* Resolve icon name к full path (search dirs + extensions). */ +static int resolve_icon_path(AVFilterContext *ctx, const char *icon_name, + char *out_path, size_t out_path_sz) +{ + CudaGridContext *s = ctx->priv; + static const char *const exts[] = { "", ".png", ".jpg", NULL }; + const char *dirs[8] = { 0 }; + int i, j, ndir = 0; + + /* Если absolute path — use as-is */ + if (icon_name[0] == '/') { + if (access(icon_name, R_OK) == 0) { + av_strlcpy(out_path, icon_name, out_path_sz); + return 0; + } + return AVERROR(ENOENT); + } + + if (s->icon_dir) dirs[ndir++] = s->icon_dir; + for (i = 0; default_icon_dirs[i] && ndir < 7; i++) dirs[ndir++] = default_icon_dirs[i]; + + for (i = 0; i < ndir; i++) { + for (j = 0; exts[j]; j++) { + snprintf(out_path, out_path_sz, "%s/%s%s", dirs[i], icon_name, exts[j]); + if (access(out_path, R_OK) == 0) return 0; + } + } + return AVERROR(ENOENT); +} + +static IconAtlas *icon_atlas_find_locked(CudaGridContext *s, const char *name) +{ + int i; + for (i = 0; i < s->nb_icon_atlases; i++) + if (!strcmp(s->icon_atlases[i].icon_name, name)) + return &s->icon_atlases[i]; + return NULL; +} + +/* Ensure GPU atlas for icon_name loaded. cu ctx pushed by caller. */ +static int ensure_icon_atlas(AVFilterContext *ctx, const char *icon_name, IconAtlas **out) +{ + CudaGridContext *s = ctx->priv; + IconAtlas *a; + uint8_t *cpu_rgba; + char path[512]; + int aw, ah; + CUdeviceptr dev_ptr = 0; + size_t dev_pitch = 0; + CUDA_MEMCPY2D cpy = { 0 }; + int ret; + + a = icon_atlas_find_locked(s, icon_name); + if (a && a->device_ptr) { *out = a; return 0; } + if (s->nb_icon_atlases >= MAX_ICON_ATLASES && !a) return AVERROR(ENOSPC); + + ret = resolve_icon_path(ctx, icon_name, path, sizeof(path)); + if (ret < 0) { + av_log(ctx, AV_LOG_WARNING, "icon '%s' not found in search paths\n", icon_name); + return ret; + } + + cpu_rgba = load_image_to_rgba_cpu(ctx, path, &aw, &ah); + if (!cpu_rgba) return AVERROR(EINVAL); + + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch( + &dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4)); + if (ret < 0) { av_free(cpu_rgba); return ret; } + + cpy.srcMemoryType = CU_MEMORYTYPE_HOST; + cpy.srcHost = cpu_rgba; + cpy.srcPitch = (size_t)aw * 4; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = dev_ptr; + cpy.dstPitch = dev_pitch; + cpy.WidthInBytes = (size_t)aw * 4; + cpy.Height = (size_t)ah; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy)); + av_free(cpu_rgba); + if (ret < 0) { + s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr); + return ret; + } + + if (!a) { + a = &s->icon_atlases[s->nb_icon_atlases++]; + memset(a, 0, sizeof(*a)); + av_strlcpy(a->icon_name, icon_name, sizeof(a->icon_name)); + } + a->device_ptr = dev_ptr; + a->device_pitch = dev_pitch; + a->w = aw; + a->h = ah; + *out = a; + av_log(ctx, AV_LOG_INFO, "icon '%s' loaded %s (%dx%d)\n", icon_name, path, aw, ah); + return 0; +} + +static int render_overlay_icon(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + IconAtlas *a = NULL; + int px, py; + int base_x, base_y, base_w, base_h; + int aw, ah, ap, ealpha; + CUdeviceptr dst_y, dst_uv; + int ret; + + if (!ov->u.icon.icon_name[0]) return 0; + ret = ensure_icon_atlas(ctx, ov->u.icon.icon_name, &a); + if (ret < 0 || !a) return 0; /* non-fatal — skip */ + + if (ov->cell < 0) { + base_x = 0; base_y = 0; + base_w = s->out_width; base_h = s->out_height; + } else if (ov->cell < s->layout->nb_cells) { + base_x = s->cell_px[ov->cell].x; + base_y = s->cell_px[ov->cell].y; + base_w = s->cell_px[ov->cell].w; + base_h = s->cell_px[ov->cell].h; + } else return 0; + + px = base_x + (int)(ov->x * base_w); + py = base_y + (int)(ov->y * base_h); + px &= ~1; py &= ~1; + if (px >= s->out_width || py >= s->out_height) return 0; + + aw = a->w; ah = a->h; + if (px + aw > s->out_width) aw = s->out_width - px; + if (py + ah > s->out_height) ah = s->out_height - py; + aw &= ~1; ah &= ~1; + if (aw <= 0 || ah <= 0) return 0; + + ap = (int)a->device_pitch; + ealpha = ov->opacity; + + dst_y = (CUdeviceptr)out->data[0]; + { + int dst_pitch_y = out->linesize[0]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_y, + DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + dst_uv = (CUdeviceptr)out->data[1]; + { + int dst_pitch_uv = out->linesize[1]; + CUdeviceptr atl = a->device_ptr; + void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha }; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel( + s->cu_func_alpha_blit_rgba_uv, + DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); + if (ret < 0) return ret; + } + return 0; +} + +static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + int px, py, pw, ph; + uint8_t Y, U, V; + int ret; + + ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph); + if (ret != 0) return ret < 0 ? ret : 0; + + rgb_to_yuv709(ov->u.rect.r, ov->u.rect.g, ov->u.rect.b, &Y, &U, &V); + + if (ov->u.rect.thickness <= 0) { + /* Filled rect */ + return render_strip_alpha(ctx, out, px, py, pw, ph, Y, U, V, ov->opacity); + } else { + int t = ov->u.rect.thickness; + t = FFMIN(t, FFMIN(pw / 2, ph / 2)); + if (t < 2) t = 2; + t &= ~1; + /* Top */ + ret = render_strip_alpha(ctx, out, px, py, pw, t, Y, U, V, ov->opacity); + if (ret < 0) return ret; + /* Bottom */ + ret = render_strip_alpha(ctx, out, px, py + ph - t, pw, t, Y, U, V, ov->opacity); + if (ret < 0) return ret; + /* Left */ + ret = render_strip_alpha(ctx, out, px, py + t, t, ph - 2 * t, Y, U, V, ov->opacity); + if (ret < 0) return ret; + /* Right */ + ret = render_strip_alpha(ctx, out, px + pw - t, py + t, t, ph - 2 * t, Y, U, V, ov->opacity); + return ret; + } +} + +/* Dim overlay: рамка тёмная (Y=16, neutral UV) полупрозрачная по cell. */ +static int render_overlay_dim(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov) +{ + CudaGridContext *s = ctx->priv; + int px, py, pw, ph; + int ret; + + ret = overlay_pixel_rect(s, ov, &px, &py, &pw, &ph); + if (ret != 0) return ret < 0 ? ret : 0; + + /* Y=16 = darkest legal Y (BT.709 limited range), UV=128 = neutral chroma */ + return render_strip_alpha(ctx, out, px, py, pw, ph, 16, 128, 128, ov->u.dim.amount); +} + +/* Render all visible overlays. cu ctx must be pushed by caller. */ +static int render_overlays(AVFilterContext *ctx, AVFrame *out) +{ + CudaGridContext *s = ctx->priv; + GridOverlay sorted[MAX_OVERLAYS]; + int i, j, n, ret; + + pthread_mutex_lock(&s->overlay_lock); + n = s->nb_overlays; + memcpy(sorted, s->overlays, (size_t)n * sizeof(GridOverlay)); + pthread_mutex_unlock(&s->overlay_lock); + + /* Insertion sort by z_order (stable). n ≤ 64 → fine. */ + for (i = 1; i < n; i++) { + GridOverlay tmp = sorted[i]; + for (j = i; j > 0 && sorted[j - 1].z_order > tmp.z_order; j--) + sorted[j] = sorted[j - 1]; + sorted[j] = tmp; + } + + for (i = 0; i < n; i++) { + const GridOverlay *ov = &sorted[i]; + if (!ov->visible) continue; + switch (ov->type) { + case OV_TYPE_RECT: + ret = render_overlay_rect(ctx, out, ov); + if (ret < 0) return ret; + break; + case OV_TYPE_DIM: + ret = render_overlay_dim(ctx, out, ov); + if (ret < 0) return ret; + break; + case OV_TYPE_TEXT: +#if CONFIG_LIBFREETYPE + ret = render_overlay_text(ctx, out, ov); + if (ret < 0) return ret; +#else + av_log(ctx, AV_LOG_TRACE, "overlay %s: text — libfreetype disabled at build\n", ov->id); +#endif + break; + case OV_TYPE_ICON: + ret = render_overlay_icon(ctx, out, ov); + if (ret < 0) return ret; + break; + } + } + return 0; +} + +/* ─── Framesync callback ──────────────────────────────────────────────── */ + +static int cuda_grid_compose(FFFrameSync *fs) +{ + AVFilterContext *ctx = fs->parent; + AVFilterLink *outlink = ctx->outputs[0]; + CudaGridContext *s = ctx->priv; + AVFrame *out = NULL; + AVFrame *in[MAX_CELLS] = {0}; + CUcontext dummy; + int i, ret; + int nb = s->layout->nb_cells; + + for (i = 0; i < nb; i++) { + ret = ff_framesync_get_frame(fs, i, &in[i], 0); + if (ret < 0) + return ret; + if (!in[i]) { + av_log(ctx, AV_LOG_WARNING, "input %d not ready, skipping frame\n", i); + return 0; + } + } + + out = ff_get_video_buffer(outlink, s->out_width, s->out_height); + if (!out) + return AVERROR(ENOMEM); + + ret = av_frame_copy_props(out, in[0]); + if (ret < 0) + goto fail; + out->width = s->out_width; + out->height = s->out_height; + + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + if (ret < 0) + goto fail; + + for (i = 0; i < nb; i++) { + AVFrame *src = in[i]; + int cx = s->cell_px[i].x; + int cy = s->cell_px[i].y; + int cw = s->cell_px[i].w; + int ch = s->cell_px[i].h; + + if (src->width != cw || src->height != ch) { + av_log(ctx, AV_LOG_ERROR, + "input %d size %dx%d != cell size %dx%d. " + "cuda_grid не делает scaling — используй upstream scale_npp:\n" + " [in%d]scale_npp=%d:%d[scaled%d]; [scaled%d]...cuda_grid=layout=%s\n", + i, src->width, src->height, cw, ch, + i, cw, ch, i, i, s->layout->name); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + ret = AVERROR(EINVAL); + goto fail; + } + + /* Y plane (1 byte per pixel) */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[0], src->linesize[0], + src->width, src->height, + (CUdeviceptr)out->data[0], out->linesize[0], + cx, cy, 1); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + + /* UV plane (NV12: half resolution, 2 bytes per "pixel") */ + ret = copy_input_plane(ctx, + (CUdeviceptr)src->data[1], src->linesize[1], + src->width / 2, src->height / 2, + (CUdeviceptr)out->data[1], out->linesize[1], + cx / 2, cy / 2, 2); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + } + + /* Overlay pass (Phase 4b) */ + ret = render_overlays(ctx, out); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + goto fail; + } + + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + return ff_filter_frame(outlink, out); + +fail: + av_frame_free(&out); + return ret; +} + +/* ─── Lifecycle ────────────────────────────────────────────────────────── */ + +static av_cold int cuda_grid_init(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + const LayoutTemplate *lt; + int i, ret; + + lt = find_layout(s->layout_name); + if (!lt) { + av_log(ctx, AV_LOG_ERROR, "unknown layout '%s'. Доступны: ", s->layout_name); + for (i = 0; i < (int)FF_ARRAY_ELEMS(layouts); i++) + av_log(ctx, AV_LOG_ERROR, "%s ", layouts[i].name); + av_log(ctx, AV_LOG_ERROR, "\n"); + return AVERROR(EINVAL); + } + s->layout = lt; + + /* Dynamic inputs — append pad per cell */ + for (i = 0; i < lt->nb_cells; i++) { + AVFilterPad pad = { 0 }; + pad.type = AVMEDIA_TYPE_VIDEO; + pad.name = av_asprintf("input%d", i); + if (!pad.name) + return AVERROR(ENOMEM); + ret = ff_append_inpad_free_name(ctx, &pad); + if (ret < 0) + return ret; + } + + /* Overlay mutex */ + ret = pthread_mutex_init(&s->overlay_lock, NULL); + if (ret) { + av_log(ctx, AV_LOG_ERROR, "overlay_lock init failed: %d\n", ret); + return AVERROR(ENOMEM); + } + s->overlay_lock_inited = 1; + + av_log(ctx, AV_LOG_INFO, "cuda_grid layout=%s cells=%d output=%dx%d (overlays: rect)\n", + lt->name, lt->nb_cells, s->out_width, s->out_height); + return 0; +} + +static av_cold void cuda_grid_uninit(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + int i; + ff_framesync_uninit(&s->fs); + + if (s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + for (i = 0; i < s->nb_atlases; i++) { + if (s->atlases[i].device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(s->atlases[i].device_ptr); + s->atlases[i].device_ptr = 0; + } + } + s->nb_atlases = 0; + for (i = 0; i < s->nb_icon_atlases; i++) { + if (s->icon_atlases[i].device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr); + s->icon_atlases[i].device_ptr = 0; + } + } + s->nb_icon_atlases = 0; + if (s->cu_module) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + } + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } +#if CONFIG_LIBFREETYPE + if (s->ft_face) { FT_Done_Face(s->ft_face); s->ft_face = NULL; } + if (s->ft_lib) { FT_Done_FreeType(s->ft_lib); s->ft_lib = NULL; } +#endif + if (s->overlay_lock_inited) { + pthread_mutex_destroy(&s->overlay_lock); + s->overlay_lock_inited = 0; + } +} + +static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd, + const char *arg, char *res, int res_len, int flags) +{ + CudaGridContext *s = ctx->priv; + int ret; + + if (!strcmp(cmd, "add_overlay")) { + GridOverlay ov; + ret = parse_overlay_args(ctx, arg, &ov); + if (ret < 0) { + if (res) av_strlcpy(res, "err parse", res_len); + return ret; + } + pthread_mutex_lock(&s->overlay_lock); + ret = overlay_upsert_locked(s, &ov); + pthread_mutex_unlock(&s->overlay_lock); + if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s n=%d" : "err full id=%s", + ov.id, s->nb_overlays); + return ret; + } + + if (!strcmp(cmd, "remove_overlay")) { + char id[OVERLAY_ID_MAX]; + if (!arg || sscanf(arg, "%31s", id) != 1) { + if (res) av_strlcpy(res, "err parse", res_len); + return AVERROR(EINVAL); + } + pthread_mutex_lock(&s->overlay_lock); + ret = overlay_remove_locked(s, id); + /* Best-effort atlas GPU cleanup: cu ctx may not be pushed here, push it. */ + if (ret == 0 && s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + atlas_remove_locked(s, id); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } + pthread_mutex_unlock(&s->overlay_lock); + if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id); + return ret; + } + + if (!strcmp(cmd, "clear_overlays")) { + int i; + pthread_mutex_lock(&s->overlay_lock); + s->nb_overlays = 0; + if (s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + for (i = 0; i < s->nb_atlases; i++) + atlas_free_gpu_locked(s, &s->atlases[i]); + s->nb_atlases = 0; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } + pthread_mutex_unlock(&s->overlay_lock); + if (res) av_strlcpy(res, "ok", res_len); + return 0; + } + + if (!strcmp(cmd, "reload_icon")) { + /* Invalidate cached icon atlas by name — next render re-reads file from disk. + * Used Phase 6 by controller для periodic chart/chat re-rendering. */ + char name[OVERLAY_ICON_NAME_MAX]; + int i, found = -1; + if (!arg || sscanf(arg, "%31s", name) != 1) { + if (res) av_strlcpy(res, "err parse", res_len); + return AVERROR(EINVAL); + } + pthread_mutex_lock(&s->overlay_lock); + if (s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + for (i = 0; i < s->nb_icon_atlases; i++) { + if (!strcmp(s->icon_atlases[i].icon_name, name)) { + if (s->icon_atlases[i].device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(s->icon_atlases[i].device_ptr); + s->icon_atlases[i].device_ptr = 0; + } + memmove(&s->icon_atlases[i], &s->icon_atlases[i + 1], + (s->nb_icon_atlases - i - 1) * sizeof(IconAtlas)); + s->nb_icon_atlases--; + found = i; + break; + } + } + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } + pthread_mutex_unlock(&s->overlay_lock); + if (res) snprintf(res, res_len, found >= 0 ? "ok %s reloaded" : "not_cached %s", name); + return 0; + } + + /* Future: set_layout (Phase 3 на filter side — пока nb_inputs хардкодится в init) */ + + /* Fall back to standard option/command handling */ + return ff_filter_process_command(ctx, cmd, arg, res, res_len, flags); +} + +static int cuda_grid_config_output(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + CudaGridContext *s = ctx->priv; + AVFilterLink *in0 = ctx->inputs[0]; + FilterLink *inl0 = ff_filter_link(in0); + FilterLink *outl = ff_filter_link(outlink); + AVHWFramesContext *hwfc0; + AVHWDeviceContext *hwdev; + AVBufferRef *out_ref; + AVHWFramesContext *out_hwfc; + int i, ret; + + if (!inl0->hw_frames_ctx) + return AVERROR(EINVAL); + hwfc0 = (AVHWFramesContext *)inl0->hw_frames_ctx->data; + + if (hwfc0->sw_format != AV_PIX_FMT_NV12) { + av_log(ctx, AV_LOG_ERROR, + "Phase 1-2a поддерживают только NV12, got %s\n", + av_get_pix_fmt_name(hwfc0->sw_format)); + return AVERROR(EINVAL); + } + + /* Compute pixel rects из normalized layout × output size */ + for (i = 0; i < s->layout->nb_cells; i++) { + s->cell_px[i].x = (int)(s->layout->cells[i].x * s->out_width); + s->cell_px[i].y = (int)(s->layout->cells[i].y * s->out_height); + s->cell_px[i].w = (int)(s->layout->cells[i].w * s->out_width); + s->cell_px[i].h = (int)(s->layout->cells[i].h * s->out_height); + /* Align до chroma boundary (NV12 → 2x2) */ + s->cell_px[i].x &= ~1; + s->cell_px[i].y &= ~1; + s->cell_px[i].w &= ~1; + s->cell_px[i].h &= ~1; + av_log(ctx, AV_LOG_VERBOSE, " cell[%d] = %dx%d @ (%d,%d)\n", + i, s->cell_px[i].w, s->cell_px[i].h, s->cell_px[i].x, s->cell_px[i].y); + } + + /* Validate все inputs: device match + sw_format match */ + for (i = 1; i < s->layout->nb_cells; i++) { + AVFilterLink *inN = ctx->inputs[i]; + FilterLink *ilN = ff_filter_link(inN); + AVHWFramesContext *hN; + if (!ilN->hw_frames_ctx) + return AVERROR(EINVAL); + hN = (AVHWFramesContext *)ilN->hw_frames_ctx->data; + if (hN->device_ctx != hwfc0->device_ctx) { + av_log(ctx, AV_LOG_ERROR, "input %d device mismatch\n", i); + return AVERROR(EINVAL); + } + if (hN->sw_format != hwfc0->sw_format) { + av_log(ctx, AV_LOG_ERROR, "input %d sw_format mismatch\n", i); + return AVERROR(EINVAL); + } + } + + outlink->w = s->out_width; + outlink->h = s->out_height; + + hwdev = hwfc0->device_ctx; + s->hwctx = (AVCUDADeviceContext *)hwdev->hwctx; + s->cu_ctx = s->hwctx->cuda_ctx; + s->cu_stream = s->hwctx->stream; + + /* Load CUDA module + resolve kernel handles (Phase 4b-2 alpha kernels) */ + { + CUcontext dummy; + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + if (ret < 0) return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_cuda_grid_ptx_data, ff_vf_cuda_grid_ptx_len); + if (ret < 0) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + return ret; + } + +#define GET_FN(handle, name) \ + ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleGetFunction( \ + &s->handle, s->cu_module, name)); \ + if (ret < 0) { \ + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); \ + return ret; \ + } + GET_FN(cu_func_alpha_fill_y, "Alpha_Fill_Y"); + GET_FN(cu_func_alpha_fill_uv, "Alpha_Fill_UV"); + GET_FN(cu_func_alpha_blit_rgba_y, "Alpha_Blit_RGBA_Y"); + GET_FN(cu_func_alpha_blit_rgba_uv, "Alpha_Blit_RGBA_UV"); +#undef GET_FN + + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } + + out_ref = av_hwframe_ctx_alloc(hwfc0->device_ref); + if (!out_ref) + return AVERROR(ENOMEM); + out_hwfc = (AVHWFramesContext *)out_ref->data; + out_hwfc->format = AV_PIX_FMT_CUDA; + out_hwfc->sw_format = AV_PIX_FMT_NV12; + out_hwfc->width = s->out_width; + out_hwfc->height = s->out_height; + out_hwfc->initial_pool_size = 4; + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) { + av_buffer_unref(&out_ref); + return ret; + } + outl->hw_frames_ctx = out_ref; + + ret = ff_framesync_init(&s->fs, ctx, s->layout->nb_cells); + if (ret < 0) + return ret; + { + FFFrameSyncIn *fs_in; + for (i = 0; i < s->layout->nb_cells; i++) { + fs_in = &s->fs.in[i]; + fs_in->time_base = ctx->inputs[i]->time_base; + fs_in->sync = 1; + fs_in->before = EXT_STOP; + fs_in->after = EXT_INFINITY; + } + } + s->fs.opaque = s; + s->fs.on_event = cuda_grid_compose; + + outlink->time_base = ctx->inputs[0]->time_base; + return ff_framesync_configure(&s->fs); +} + +static int cuda_grid_activate(AVFilterContext *ctx) +{ + CudaGridContext *s = ctx->priv; + return ff_framesync_activate(&s->fs); +} + +/* ─── Options + registration ───────────────────────────────────────────── */ + +#define OFFSET(x) offsetof(CudaGridContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) + +static const AVOption cuda_grid_options[] = { + { "layout", "имя layout template", + OFFSET(layout_name), AV_OPT_TYPE_STRING, { .str = "quad" }, 0, 0, FLAGS }, + { "out_w", "ширина output frame в пикселях", + OFFSET(out_width), AV_OPT_TYPE_INT, { .i64 = 1920 }, 16, 16384, FLAGS }, + { "out_h", "высота output frame в пикселях", + OFFSET(out_height), AV_OPT_TYPE_INT, { .i64 = 1080 }, 16, 16384, FLAGS }, + { "font_file", "TTF/OTF font path (default: search DejaVu/Liberation)", + OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { "font_size", "default text overlay size (px), используется если overlay не указал свой", + OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS }, + { "icon_dir", "directory для resolve icon_name (по умолчанию /var/lib/cuda-grid/icons)", + OFFSET(icon_dir), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { NULL } +}; + +static const AVClass cuda_grid_class = { + .class_name = "cuda_grid", + .item_name = av_default_item_name, + .option = cuda_grid_options, + .version = LIBAVUTIL_VERSION_INT, + .category = AV_CLASS_CATEGORY_FILTER, +}; + +static const AVFilterPad cuda_grid_outputs[] = { + { .name = "default", .type = AVMEDIA_TYPE_VIDEO, .config_props = cuda_grid_config_output }, +}; + +const AVFilter ff_vf_cuda_grid = { + .name = "cuda_grid", + .description = NULL_IF_CONFIG_SMALL("GPU-native video grid composer (CUDA)."), + .priv_class = &cuda_grid_class, + .priv_size = sizeof(CudaGridContext), + .init = cuda_grid_init, + .uninit = cuda_grid_uninit, + .activate = cuda_grid_activate, + .process_command = cuda_grid_process_command, + /* No FILTER_INPUTS — pads added dynamically в init() per layout. */ + FILTER_OUTPUTS(cuda_grid_outputs), + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + .flags = AVFILTER_FLAG_HWDEVICE | AVFILTER_FLAG_DYNAMIC_INPUTS, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_cuda_grid.cu b/libavfilter/vf_cuda_grid.cu new file mode 100644 index 0000000..10d2af6 --- /dev/null +++ b/libavfilter/vf_cuda_grid.cu @@ -0,0 +1,121 @@ +/* + * cuda_grid overlay CUDA kernels. + * + * Алфа-блендинг poверх NV12 frame: + * - Alpha_Fill_Y / Alpha_Fill_UV: solid colour fill region (rect/dim) + * - Alpha_Blit_RGBA_Y / Alpha_Blit_RGBA_UV: blit RGBA atlas → NV12 + * (для text Phase 4b-3 и icon Phase 4b-4) + * + * BT.709 limited-range conversion (HDTV). См. также vf_cuda_grid.c rgb_to_yuv709. + * + * Лицензия: LGPL-2.1+ (соответствует FFmpeg) + */ + +#include "cuda/vector_helpers.cuh" + +extern "C" { + +/* Solid colour α-blend на Y plane. + * dst: pointer на Y plane base + * dst_pitch: bytes per row + * rx, ry, rw, rh: region rect (pixels, must be in-bounds) + * fill: 0..255 (Y component to blend in) + * alpha: 0..255 (255 = fully opaque) + */ +__global__ void Alpha_Fill_Y(unsigned char *dst, int dst_pitch, + int rx, int ry, int rw, int rh, + int fill, int alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= rw || y >= rh) return; + unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x); + int cur = *p; + *p = (unsigned char)((fill * alpha + cur * (255 - alpha)) / 255); +} + +/* Solid colour α-blend на UV plane (NV12 interleaved). + * rx, ry: in chroma plane coords (= full-res Y x/2, y/2) + * rw, rh: also в chroma coords + */ +__global__ void Alpha_Fill_UV(unsigned char *dst, int dst_pitch, + int rx, int ry, int rw, int rh, + int fill_u, int fill_v, int alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= rw || y >= rh) return; + unsigned char *p = dst + (ry + y) * dst_pitch + (rx + x) * 2; + int cu = p[0], cv = p[1]; + p[0] = (unsigned char)((fill_u * alpha + cu * (255 - alpha)) / 255); + p[1] = (unsigned char)((fill_v * alpha + cv * (255 - alpha)) / 255); +} + +/* Blit RGBA atlas → Y plane с α-blending. + * dst, dst_pitch: Y plane + * dx, dy: destination pixel offset (full-res coords) + * atlas, atlas_pitch: RGBA source (interleaved R,G,B,A bytes), pitch в bytes + * w, h: atlas dimensions (in pixels) + * extra_alpha: дополнительный множитель (0..255) — overlay-level opacity + */ +__global__ void Alpha_Blit_RGBA_Y(unsigned char *dst, int dst_pitch, + int dx, int dy, + const unsigned char *atlas, int atlas_pitch, + int w, int h, int extra_alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= w || y >= h) return; + const unsigned char *sp = atlas + y * atlas_pitch + x * 4; + int r = sp[0], g = sp[1], b = sp[2], a = sp[3]; + a = a * extra_alpha / 255; + if (a == 0) return; + int Y = (int)(0.183f * r + 0.614f * g + 0.062f * b) + 16; + Y = Y < 0 ? 0 : (Y > 255 ? 255 : Y); + unsigned char *p = dst + (dy + y) * dst_pitch + (dx + x); + int cur = *p; + *p = (unsigned char)((Y * a + cur * (255 - a)) / 255); +} + +/* Blit RGBA atlas → UV plane с 4:2:0 chroma subsampling. + * dst, dst_pitch: UV plane + * dx, dy: destination pixel offset (full-res coords; обе должны быть кратны 2) + * atlas, atlas_pitch: RGBA source + * w, h: atlas dimensions (full-res; UV operates на w/2 × h/2) + * extra_alpha: 0..255 + */ +__global__ void Alpha_Blit_RGBA_UV(unsigned char *dst, int dst_pitch, + int dx, int dy, + const unsigned char *atlas, int atlas_pitch, + int w, int h, int extra_alpha) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int hw = w / 2, hh = h / 2; + if (x >= hw || y >= hh) return; + + int sx = x * 2, sy = y * 2; + const unsigned char *row0 = atlas + sy * atlas_pitch; + const unsigned char *row1 = atlas + (sy + 1) * atlas_pitch; + + int r = (row0[sx*4+0] + row0[(sx+1)*4+0] + row1[sx*4+0] + row1[(sx+1)*4+0]) >> 2; + int g = (row0[sx*4+1] + row0[(sx+1)*4+1] + row1[sx*4+1] + row1[(sx+1)*4+1]) >> 2; + int b = (row0[sx*4+2] + row0[(sx+1)*4+2] + row1[sx*4+2] + row1[(sx+1)*4+2]) >> 2; + int a = (row0[sx*4+3] + row0[(sx+1)*4+3] + row1[sx*4+3] + row1[(sx+1)*4+3]) >> 2; + a = a * extra_alpha / 255; + if (a == 0) return; + + int U = (int)(-0.101f * r - 0.339f * g + 0.439f * b) + 128; + int V = (int)( 0.439f * r - 0.399f * g - 0.040f * b) + 128; + U = U < 0 ? 0 : (U > 255 ? 255 : U); + V = V < 0 ? 0 : (V > 255 ? 255 : V); + + int du = dx / 2 + x; + int dv = dy / 2 + y; + unsigned char *p = dst + dv * dst_pitch + du * 2; + int cu = p[0], cv = p[1]; + p[0] = (unsigned char)((U * a + cu * (255 - a)) / 255); + p[1] = (unsigned char)((V * a + cv * (255 - a)) / 255); +} + +} /* extern "C" */