diff --git a/libavfilter/vf_cuda_grid.c b/libavfilter/vf_cuda_grid.c index 675fb15..11735e1 100644 --- a/libavfilter/vf_cuda_grid.c +++ b/libavfilter/vf_cuda_grid.c @@ -31,6 +31,8 @@ #include "config_components.h" +#include + #include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/cuda_check.h" @@ -42,6 +44,12 @@ #include "libavutil/pixdesc.h" #include "libavutil/thread.h" +#include "config.h" +#if CONFIG_LIBFREETYPE +#include +#include FT_FREETYPE_H +#endif + #include "avfilter.h" #include "cuda/load_helper.h" #include "filters.h" @@ -183,6 +191,17 @@ static const LayoutTemplate *find_layout(const char *name) /* ─── Filter state ─────────────────────────────────────────────────────── */ +/* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */ +typedef struct TextAtlas { + char id[OVERLAY_ID_MAX]; + char cached_text[OVERLAY_TEXT_MAX]; + int cached_font_size; + uint8_t cached_r, cached_g, cached_b; + CUdeviceptr device_ptr; + size_t device_pitch; + int w, h; +} TextAtlas; + typedef struct CudaGridContext { const AVClass *class; @@ -190,6 +209,8 @@ typedef struct CudaGridContext { char *layout_name; int out_width; int out_height; + char *font_file; + int default_font_size; /* Resolved layout (после init) */ const LayoutTemplate *layout; @@ -218,6 +239,16 @@ typedef struct CudaGridContext { CUfunction cu_func_alpha_fill_uv; CUfunction cu_func_alpha_blit_rgba_y; CUfunction cu_func_alpha_blit_rgba_uv; + + /* Text rendering (Phase 4b-3, libfreetype optional) */ +#if CONFIG_LIBFREETYPE + FT_Library ft_lib; + FT_Face ft_face; + int ft_ready; /* 1 once library + face loaded */ + int ft_init_attempted; /* set on first attempt, success or fail */ +#endif + TextAtlas atlases[MAX_OVERLAYS]; + int nb_atlases; } CudaGridContext; /* ─── Composition: copy одного input plane в target region output ──────── */ @@ -456,6 +487,344 @@ static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out, return 0; } +/* ─── Text rendering 
#if CONFIG_LIBFREETYPE

/* Fallback font files tried, in order, when the font_file option is unset. */
static const char *const default_fonts[] = {
    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
    "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf",
    "/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf",
    NULL,
};

/**
 * Lazily initialise FreeType and load the face used for text overlays.
 *
 * Initialisation is attempted only once per filter instance: after the
 * first failure every later call returns AVERROR(ENOSYS) without retrying
 * (and without logging again).
 *
 * If the font_file option is set (and non-empty) only that file is tried.
 * Otherwise each entry of default_fonts is opened in turn and the first one
 * FreeType accepts is used. Candidates are probed by actually opening them,
 * so there is no separate existence check that could go stale, and a file
 * that exists but is not a usable font simply falls through to the next one.
 *
 * @param ctx filter context; ctx->priv is the CudaGridContext
 * @return 0 when the face is ready,
 *         AVERROR(ENOSYS) if FreeType could not be initialised or a previous
 *                         attempt already failed,
 *         AVERROR(ENOENT) if no default font could be opened,
 *         AVERROR(EIO)    if the user-supplied font_file could not be loaded
 */
static int ensure_ft_loaded(AVFilterContext *ctx)
{
    CudaGridContext *s = ctx->priv;
    const char *path = s->font_file;
    FT_Error err;
    int ret;
    int i;

    if (s->ft_ready)
        return 0;
    if (s->ft_init_attempted)
        return AVERROR(ENOSYS);
    s->ft_init_attempted = 1;

    err = FT_Init_FreeType(&s->ft_lib);
    if (err) {
        av_log(ctx, AV_LOG_WARNING, "FT_Init_FreeType failed (err=%d), text overlays disabled\n", err);
        /* uninit tests this handle, so never leave it in an unknown state */
        s->ft_lib = NULL;
        return AVERROR(ENOSYS);
    }

    /* An empty option string means "not set". */
    if (path && !path[0])
        path = NULL;

    if (path) {
        err = FT_New_Face(s->ft_lib, path, 0, &s->ft_face);
        if (err) {
            av_log(ctx, AV_LOG_WARNING, "FT_New_Face(%s) failed: %d, text disabled\n", path, err);
            ret = AVERROR(EIO);
            goto fail;
        }
    } else {
        for (i = 0; default_fonts[i]; i++) {
            if (!FT_New_Face(s->ft_lib, default_fonts[i], 0, &s->ft_face)) {
                path = default_fonts[i];
                break;
            }
            s->ft_face = NULL;
        }
        if (!path) {
            av_log(ctx, AV_LOG_WARNING, "no font found (set font_file= option), text disabled\n");
            ret = AVERROR(ENOENT);
            goto fail;
        }
    }

    av_log(ctx, AV_LOG_INFO, "freetype: loaded %s\n", path);
    s->ft_ready = 1;
    return 0;

fail:
    /* No face was created on this path; drop the library and clear both
     * handles so later cleanup has nothing to release. */
    s->ft_face = NULL;
    FT_Done_FreeType(s->ft_lib);
    s->ft_lib = NULL;
    return ret;
}

#endif /* CONFIG_LIBFREETYPE */
*/ +static uint8_t *rasterize_text_rgba(AVFilterContext *ctx, const char *text, int font_size, + uint8_t r, uint8_t g, uint8_t b, + int *out_w, int *out_h) +{ + CudaGridContext *s = ctx->priv; + FT_Error err; + const unsigned char *p; + int total_w = 0, ascent_px = 0, descent_px = 0; + int atlas_w, atlas_h, pen_x, baseline_y; + uint8_t *atlas; + + if (!s->ft_ready) return NULL; + err = FT_Set_Pixel_Sizes(s->ft_face, 0, font_size); + if (err) { + av_log(ctx, AV_LOG_WARNING, "FT_Set_Pixel_Sizes failed: %d\n", err); + return NULL; + } + + /* Pass 1: measure */ + for (p = (const unsigned char *)text; *p; p++) { + FT_UInt gi; + FT_GlyphSlot g_slot; + gi = FT_Get_Char_Index(s->ft_face, *p); + if (!gi) continue; + if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_DEFAULT)) continue; + g_slot = s->ft_face->glyph; + total_w += (g_slot->advance.x >> 6); + ascent_px = FFMAX(ascent_px, g_slot->bitmap_top); + descent_px = FFMAX(descent_px, (int)g_slot->bitmap.rows - g_slot->bitmap_top); + } + if (total_w <= 0) return NULL; + + atlas_w = total_w + 4; /* padding */ + atlas_h = ascent_px + descent_px + 4; + if (atlas_h <= 0) atlas_h = font_size; + + atlas = av_mallocz((size_t)atlas_w * atlas_h * 4); + if (!atlas) return NULL; + + /* Pass 2: blit */ + pen_x = 2; + baseline_y = ascent_px + 2; + for (p = (const unsigned char *)text; *p; p++) { + FT_UInt gi; + FT_GlyphSlot g_slot; + int gx, gy, bw, bh; + int yy, xx; + + gi = FT_Get_Char_Index(s->ft_face, *p); + if (!gi) continue; + if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_RENDER)) continue; + g_slot = s->ft_face->glyph; + if (g_slot->bitmap.pixel_mode != FT_PIXEL_MODE_GRAY) { + pen_x += g_slot->advance.x >> 6; + continue; + } + bw = g_slot->bitmap.width; + bh = g_slot->bitmap.rows; + gx = pen_x + g_slot->bitmap_left; + gy = baseline_y - g_slot->bitmap_top; + + for (yy = 0; yy < bh; yy++) { + int dy = gy + yy; + if (dy < 0 || dy >= atlas_h) continue; + for (xx = 0; xx < bw; xx++) { + int dx = gx + xx; + uint8_t gray; + uint8_t *dst; + if (dx 
< 0 || dx >= atlas_w) continue; + gray = g_slot->bitmap.buffer[yy * g_slot->bitmap.pitch + xx]; + if (gray == 0) continue; + dst = atlas + ((size_t)dy * atlas_w + dx) * 4; + /* Premultiplied: max() — text "fills" rather than blending overlaps */ + if (gray > dst[3]) { + dst[0] = r; + dst[1] = g; + dst[2] = b; + dst[3] = gray; + } + } + } + pen_x += g_slot->advance.x >> 6; + } + + *out_w = atlas_w; + *out_h = atlas_h; + return atlas; +} + +#endif /* CONFIG_LIBFREETYPE */ + +/* Cache lookup — locked by caller. */ +static TextAtlas *atlas_find_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_atlases; i++) + if (!strcmp(s->atlases[i].id, id)) + return &s->atlases[i]; + return NULL; +} + +static void atlas_free_gpu_locked(CudaGridContext *s, TextAtlas *a) +{ + if (a->device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(a->device_ptr); + a->device_ptr = 0; + } +} + +static void atlas_remove_locked(CudaGridContext *s, const char *id) +{ + int i; + for (i = 0; i < s->nb_atlases; i++) { + if (!strcmp(s->atlases[i].id, id)) { + atlas_free_gpu_locked(s, &s->atlases[i]); + memmove(&s->atlases[i], &s->atlases[i + 1], + (s->nb_atlases - i - 1) * sizeof(TextAtlas)); + s->nb_atlases--; + return; + } + } +} + +#if CONFIG_LIBFREETYPE +/* Ensure GPU atlas for given overlay matches its text/font_size/color. cu ctx pushed by caller. */ +static int ensure_text_atlas(AVFilterContext *ctx, const GridOverlay *ov, TextAtlas **out_atlas) +{ + CudaGridContext *s = ctx->priv; + TextAtlas *a; + uint8_t *cpu_atlas; + int aw, ah; + int font_size; + CUdeviceptr dev_ptr = 0; + size_t dev_pitch = 0; + CUDA_MEMCPY2D cpy = { 0 }; + int ret; + + font_size = ov->u.text.font_size > 0 ? 
#if CONFIG_LIBFREETYPE

/**
 * Make sure a GPU atlas exists for a text overlay and matches its current
 * text, font size and colour. The CUDA context is expected to have been
 * pushed by the caller.
 *
 * A matching cached entry is returned as is. Otherwise the text is
 * rasterized on the CPU, uploaded into a new pitched device buffer, and the
 * entry is created or updated; an existing entry's old buffer is freed only
 * after the new upload succeeded, so a failure leaves the entry untouched.
 *
 * @param ctx       filter context
 * @param ov        text overlay description
 * @param out_atlas receives the entry on success
 * @return 0 on success,
 *         AVERROR(ENOSPC) if the table is full and the overlay has no entry,
 *         the error from ensure_ft_loaded() if the font is unavailable,
 *         AVERROR(EINVAL) if rasterization produced nothing,
 *         or the error reported by the CUDA allocation / copy
 */
static int ensure_text_atlas(AVFilterContext *ctx, const GridOverlay *ov, TextAtlas **out_atlas)
{
    CudaGridContext *s = ctx->priv;
    const int font_size = ov->u.text.font_size > 0 ? ov->u.text.font_size
                                                   : s->default_font_size;
    TextAtlas *entry = atlas_find_locked(s, ov->id);
    CUDA_MEMCPY2D upload = { 0 };
    CUdeviceptr gpu_buf = 0;
    size_t gpu_pitch = 0;
    size_t row_bytes;
    uint8_t *pixels;
    int width, height;
    int ret;

    /* Cache hit: nothing the image depends on has changed. */
    if (entry &&
        entry->cached_font_size == font_size &&
        entry->cached_r == ov->u.text.r &&
        entry->cached_g == ov->u.text.g &&
        entry->cached_b == ov->u.text.b &&
        strcmp(entry->cached_text, ov->u.text.text) == 0) {
        *out_atlas = entry;
        return 0;
    }

    /* A new entry would be needed but the table has no free slot. */
    if (!entry && s->nb_atlases >= MAX_OVERLAYS)
        return AVERROR(ENOSPC);

    ret = ensure_ft_loaded(ctx);
    if (ret < 0)
        return ret;

    pixels = rasterize_text_rgba(ctx, ov->u.text.text, font_size,
                                 ov->u.text.r, ov->u.text.g, ov->u.text.b,
                                 &width, &height);
    if (!pixels)
        return AVERROR(EINVAL);

    /* Allocate the pitched device buffer and upload the image. */
    row_bytes = (size_t)width * 4;
    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch(
                       &gpu_buf, &gpu_pitch, row_bytes, (size_t)height, 4));
    if (ret < 0) {
        av_free(pixels);
        return ret;
    }

    upload.srcMemoryType = CU_MEMORYTYPE_HOST;
    upload.srcHost       = pixels;
    upload.srcPitch      = row_bytes;
    upload.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    upload.dstDevice     = gpu_buf;
    upload.dstPitch      = gpu_pitch;
    upload.WidthInBytes  = row_bytes;
    upload.Height        = (size_t)height;
    ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&upload));
    av_free(pixels);
    if (ret < 0) {
        s->hwctx->internal->cuda_dl->cuMemFree(gpu_buf);
        return ret;
    }

    if (entry) {
        /* Replace the stale image of an existing entry. */
        atlas_free_gpu_locked(s, entry);
    } else {
        entry = &s->atlases[s->nb_atlases++];
        memset(entry, 0, sizeof(*entry));
        av_strlcpy(entry->id, ov->id, sizeof(entry->id));
    }

    av_strlcpy(entry->cached_text, ov->u.text.text, sizeof(entry->cached_text));
    entry->cached_font_size = font_size;
    entry->cached_r         = ov->u.text.r;
    entry->cached_g         = ov->u.text.g;
    entry->cached_b         = ov->u.text.b;
    entry->device_ptr       = gpu_buf;
    entry->device_pitch     = gpu_pitch;
    entry->w                = width;
    entry->h                = height;

    *out_atlas = entry;
    return 0;
}

/**
 * Draw one text overlay onto the output frame by alpha-blitting its atlas
 * into plane 0 and plane 1 (the code treats them as the Y and UV planes).
 *
 * ov->x / ov->y are scaled by the size of the reference rectangle (the whole
 * output when ov->cell < 0, otherwise that layout cell) and give the
 * top-left corner of the text. The corner is snapped down to even
 * coordinates and the blitted area is clipped to the output size and to
 * even dimensions.
 *
 * An overlay is silently skipped (return 0) when its text is empty, its
 * atlas cannot be prepared (a warning is logged), its cell index is out of
 * range, or its origin lies outside the output on any side. Origins above or
 * left of the frame are rejected because, per the note in the original code,
 * the blit kernels bounds-check against the atlas size and not against the
 * destination, so such an origin would make them write before the plane start.
 *
 * @return 0 on success or skip, a negative error code if a kernel launch fails
 */
static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
{
    CudaGridContext *s = ctx->priv;
    TextAtlas *a = NULL;
    int px, py;
    int base_x, base_y, base_w, base_h;
    CUdeviceptr dst_y, dst_uv;
    int ret;
    int aw, ah, ap, ealpha;

    if (!ov->u.text.text[0])
        return 0;

    ret = ensure_text_atlas(ctx, ov, &a);
    if (ret < 0) {
        av_log(ctx, AV_LOG_WARNING, "text overlay %s: atlas failed (ret=%d)\n", ov->id, ret);
        return 0; /* non-fatal: skip this overlay */
    }
    if (!a)
        return 0;

    /* Reference rectangle: whole output, or one layout cell. */
    if (ov->cell < 0) {
        base_x = 0;
        base_y = 0;
        base_w = s->out_width;
        base_h = s->out_height;
    } else if (ov->cell < s->layout->nb_cells) {
        base_x = s->cell_px[ov->cell].x;
        base_y = s->cell_px[ov->cell].y;
        base_w = s->cell_px[ov->cell].w;
        base_h = s->cell_px[ov->cell].h;
    } else {
        return 0;
    }

    px = base_x + (int)(ov->x * base_w);
    py = base_y + (int)(ov->y * base_h);

    /* Origin outside the frame on any side: nothing that can be drawn safely. */
    if (px < 0 || py < 0 || px >= s->out_width || py >= s->out_height)
        return 0;

    /* Snap to even coordinates. */
    px &= ~1;
    py &= ~1;

    /* Clip the blitted area to the output, then to even dimensions. */
    aw = a->w;
    ah = a->h;
    if (px + aw > s->out_width)
        aw = s->out_width - px;
    if (py + ah > s->out_height)
        ah = s->out_height - py;
    aw &= ~1;
    ah &= ~1;
    if (aw <= 0 || ah <= 0)
        return 0;

    ap     = (int)a->device_pitch;
    ealpha = ov->opacity;

    /* Plane 0: one thread per atlas pixel. */
    dst_y = (CUdeviceptr)out->data[0];
    {
        int dst_pitch_y = out->linesize[0];
        CUdeviceptr atl = a->device_ptr;
        void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha };

        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
                           s->cu_func_alpha_blit_rgba_y,
                           DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1,
                           BLOCKX, BLOCKY, 1,
                           0, s->cu_stream, args, NULL));
        if (ret < 0)
            return ret;
    }

    /* Plane 1: launched over half the width and height; the kernel still
     * receives the full-resolution position and size. */
    dst_uv = (CUdeviceptr)out->data[1];
    {
        int dst_pitch_uv = out->linesize[1];
        CUdeviceptr atl = a->device_ptr;
        void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha };

        ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
                           s->cu_func_alpha_blit_rgba_uv,
                           DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1,
                           BLOCKX, BLOCKY, 1,
                           0, s->cu_stream, args, NULL));
        if (ret < 0)
            return ret;
    }

    return 0;
}

#endif /* CONFIG_LIBFREETYPE */
CudaGridContext *s = ctx->priv; + int i; ff_framesync_uninit(&s->fs); - if (s->cu_module && s->hwctx) { + + if (s->hwctx) { CUcontext dummy; CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); - CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); + for (i = 0; i < s->nb_atlases; i++) { + if (s->atlases[i].device_ptr) { + s->hwctx->internal->cuda_dl->cuMemFree(s->atlases[i].device_ptr); + s->atlases[i].device_ptr = 0; + } + } + s->nb_atlases = 0; + if (s->cu_module) { + CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + } CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); - s->cu_module = NULL; } +#if CONFIG_LIBFREETYPE + if (s->ft_face) { FT_Done_Face(s->ft_face); s->ft_face = NULL; } + if (s->ft_lib) { FT_Done_FreeType(s->ft_lib); s->ft_lib = NULL; } +#endif if (s->overlay_lock_inited) { pthread_mutex_destroy(&s->overlay_lock); s->overlay_lock_inited = 0; @@ -731,14 +1120,30 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd, } pthread_mutex_lock(&s->overlay_lock); ret = overlay_remove_locked(s, id); + /* Best-effort atlas GPU cleanup: cu ctx may not be pushed here, push it. */ + if (ret == 0 && s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + atlas_remove_locked(s, id); + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } pthread_mutex_unlock(&s->overlay_lock); if (res) snprintf(res, res_len, ret == 0 ? 
"ok id=%s" : "not_found id=%s", id); return ret; } if (!strcmp(cmd, "clear_overlays")) { + int i; pthread_mutex_lock(&s->overlay_lock); s->nb_overlays = 0; + if (s->hwctx) { + CUcontext dummy; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx)); + for (i = 0; i < s->nb_atlases; i++) + atlas_free_gpu_locked(s, &s->atlases[i]); + s->nb_atlases = 0; + CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy)); + } pthread_mutex_unlock(&s->overlay_lock); if (res) av_strlcpy(res, "ok", res_len); return 0; @@ -899,6 +1304,10 @@ static const AVOption cuda_grid_options[] = { OFFSET(out_width), AV_OPT_TYPE_INT, { .i64 = 1920 }, 16, 16384, FLAGS }, { "out_h", "высота output frame в пикселях", OFFSET(out_height), AV_OPT_TYPE_INT, { .i64 = 1080 }, 16, 16384, FLAGS }, + { "font_file", "TTF/OTF font path (default: search DejaVu/Liberation)", + OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { "font_size", "default text overlay size (px), используется если overlay не указал свой", + OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS }, { NULL } };