vf_cuda_grid: Phase 4b-3 — text overlay (freetype + RGBA atlas)

CPU rasterization через freetype (DejaVu/Liberation auto-detect или
font_file= option), upload pitched RGBA buffer to GPU, blit через
Alpha_Blit_RGBA_Y/UV kernels (Phase 4b-2 уже had).

Cache:
  per-overlay-id atlas с keys (text, font_size, r/g/b) — re-rasterize
  только при change. Cleanup при remove_overlay/clear_overlays/uninit.

Options:
  font_file=  — TTF path (default: search DejaVu/Liberation)
  font_size=  — default size if overlay не указал свой

Wire format: add_overlay <id> text x=.. y=.. text=hello font_size=24 r=255 g=255 b=255 opacity=200

Conditional на CONFIG_LIBFREETYPE — без него text overlays no-op
(остальные типы работают как обычно).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
gx
2026-05-19 22:30:36 +01:00
parent 1e54f04e24
commit 4010461300
+413 -4
View File
@@ -31,6 +31,8 @@
#include "config_components.h"
#include <unistd.h>
#include "libavutil/avstring.h"
#include "libavutil/common.h"
#include "libavutil/cuda_check.h"
@@ -42,6 +44,12 @@
#include "libavutil/pixdesc.h"
#include "libavutil/thread.h"
#include "config.h"
#if CONFIG_LIBFREETYPE
#include <ft2build.h>
#include FT_FREETYPE_H
#endif
#include "avfilter.h"
#include "cuda/load_helper.h"
#include "filters.h"
@@ -183,6 +191,17 @@ static const LayoutTemplate *find_layout(const char *name)
/* ─── Filter state ─────────────────────────────────────────────────────── */
/* Text atlas — rasterized RGBA buffer on GPU, keyed by overlay id (Phase 4b-3). */
typedef struct TextAtlas {
char id[OVERLAY_ID_MAX];
char cached_text[OVERLAY_TEXT_MAX];
int cached_font_size;
uint8_t cached_r, cached_g, cached_b;
CUdeviceptr device_ptr;
size_t device_pitch;
int w, h;
} TextAtlas;
typedef struct CudaGridContext {
const AVClass *class;
@@ -190,6 +209,8 @@ typedef struct CudaGridContext {
char *layout_name;
int out_width;
int out_height;
char *font_file;
int default_font_size;
/* Resolved layout (после init) */
const LayoutTemplate *layout;
@@ -218,6 +239,16 @@ typedef struct CudaGridContext {
CUfunction cu_func_alpha_fill_uv;
CUfunction cu_func_alpha_blit_rgba_y;
CUfunction cu_func_alpha_blit_rgba_uv;
/* Text rendering (Phase 4b-3, libfreetype optional) */
#if CONFIG_LIBFREETYPE
FT_Library ft_lib;
FT_Face ft_face;
int ft_ready; /* 1 once library + face loaded */
int ft_init_attempted; /* set on first attempt, success or fail */
#endif
TextAtlas atlases[MAX_OVERLAYS];
int nb_atlases;
} CudaGridContext;
/* ─── Composition: copy одного input plane в target region output ──────── */
@@ -456,6 +487,344 @@ static int render_strip_alpha(AVFilterContext *ctx, AVFrame *out,
return 0;
}
/* ─── Text rendering (Phase 4b-3, freetype) ────────────────────────────── */
#if CONFIG_LIBFREETYPE
static const char *const default_fonts[] = {
"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
"/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
"/usr/share/fonts/TTF/DejaVuSans-Bold.ttf",
"/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf",
NULL,
};
static int ensure_ft_loaded(AVFilterContext *ctx)
{
CudaGridContext *s = ctx->priv;
const char *path = s->font_file;
FT_Error err;
int i;
if (s->ft_ready) return 0;
if (s->ft_init_attempted) return AVERROR(ENOSYS);
s->ft_init_attempted = 1;
err = FT_Init_FreeType(&s->ft_lib);
if (err) {
av_log(ctx, AV_LOG_WARNING, "FT_Init_FreeType failed (err=%d), text overlays disabled\n", err);
return AVERROR(ENOSYS);
}
if (!path) {
for (i = 0; default_fonts[i]; i++) {
if (access(default_fonts[i], R_OK) == 0) {
path = default_fonts[i];
break;
}
}
}
if (!path) {
av_log(ctx, AV_LOG_WARNING, "no font found (set font_file= option), text disabled\n");
FT_Done_FreeType(s->ft_lib);
s->ft_lib = NULL;
return AVERROR(ENOENT);
}
err = FT_New_Face(s->ft_lib, path, 0, &s->ft_face);
if (err) {
av_log(ctx, AV_LOG_WARNING, "FT_New_Face(%s) failed: %d, text disabled\n", path, err);
FT_Done_FreeType(s->ft_lib);
s->ft_lib = NULL;
return AVERROR(EIO);
}
av_log(ctx, AV_LOG_INFO, "freetype: loaded %s\n", path);
s->ft_ready = 1;
return 0;
}
/* Two-pass rasterize: 1) measure total_w / ascent / descent, 2) blit to RGBA buffer.
* Returns malloc'd RGBA buffer (caller frees) + dims via out params, or NULL. */
static uint8_t *rasterize_text_rgba(AVFilterContext *ctx, const char *text, int font_size,
uint8_t r, uint8_t g, uint8_t b,
int *out_w, int *out_h)
{
CudaGridContext *s = ctx->priv;
FT_Error err;
const unsigned char *p;
int total_w = 0, ascent_px = 0, descent_px = 0;
int atlas_w, atlas_h, pen_x, baseline_y;
uint8_t *atlas;
if (!s->ft_ready) return NULL;
err = FT_Set_Pixel_Sizes(s->ft_face, 0, font_size);
if (err) {
av_log(ctx, AV_LOG_WARNING, "FT_Set_Pixel_Sizes failed: %d\n", err);
return NULL;
}
/* Pass 1: measure */
for (p = (const unsigned char *)text; *p; p++) {
FT_UInt gi;
FT_GlyphSlot g_slot;
gi = FT_Get_Char_Index(s->ft_face, *p);
if (!gi) continue;
if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_DEFAULT)) continue;
g_slot = s->ft_face->glyph;
total_w += (g_slot->advance.x >> 6);
ascent_px = FFMAX(ascent_px, g_slot->bitmap_top);
descent_px = FFMAX(descent_px, (int)g_slot->bitmap.rows - g_slot->bitmap_top);
}
if (total_w <= 0) return NULL;
atlas_w = total_w + 4; /* padding */
atlas_h = ascent_px + descent_px + 4;
if (atlas_h <= 0) atlas_h = font_size;
atlas = av_mallocz((size_t)atlas_w * atlas_h * 4);
if (!atlas) return NULL;
/* Pass 2: blit */
pen_x = 2;
baseline_y = ascent_px + 2;
for (p = (const unsigned char *)text; *p; p++) {
FT_UInt gi;
FT_GlyphSlot g_slot;
int gx, gy, bw, bh;
int yy, xx;
gi = FT_Get_Char_Index(s->ft_face, *p);
if (!gi) continue;
if (FT_Load_Glyph(s->ft_face, gi, FT_LOAD_RENDER)) continue;
g_slot = s->ft_face->glyph;
if (g_slot->bitmap.pixel_mode != FT_PIXEL_MODE_GRAY) {
pen_x += g_slot->advance.x >> 6;
continue;
}
bw = g_slot->bitmap.width;
bh = g_slot->bitmap.rows;
gx = pen_x + g_slot->bitmap_left;
gy = baseline_y - g_slot->bitmap_top;
for (yy = 0; yy < bh; yy++) {
int dy = gy + yy;
if (dy < 0 || dy >= atlas_h) continue;
for (xx = 0; xx < bw; xx++) {
int dx = gx + xx;
uint8_t gray;
uint8_t *dst;
if (dx < 0 || dx >= atlas_w) continue;
gray = g_slot->bitmap.buffer[yy * g_slot->bitmap.pitch + xx];
if (gray == 0) continue;
dst = atlas + ((size_t)dy * atlas_w + dx) * 4;
/* Premultiplied: max() — text "fills" rather than blending overlaps */
if (gray > dst[3]) {
dst[0] = r;
dst[1] = g;
dst[2] = b;
dst[3] = gray;
}
}
}
pen_x += g_slot->advance.x >> 6;
}
*out_w = atlas_w;
*out_h = atlas_h;
return atlas;
}
#endif /* CONFIG_LIBFREETYPE */
/* Cache lookup — locked by caller. */
static TextAtlas *atlas_find_locked(CudaGridContext *s, const char *id)
{
int i;
for (i = 0; i < s->nb_atlases; i++)
if (!strcmp(s->atlases[i].id, id))
return &s->atlases[i];
return NULL;
}
static void atlas_free_gpu_locked(CudaGridContext *s, TextAtlas *a)
{
if (a->device_ptr) {
s->hwctx->internal->cuda_dl->cuMemFree(a->device_ptr);
a->device_ptr = 0;
}
}
static void atlas_remove_locked(CudaGridContext *s, const char *id)
{
int i;
for (i = 0; i < s->nb_atlases; i++) {
if (!strcmp(s->atlases[i].id, id)) {
atlas_free_gpu_locked(s, &s->atlases[i]);
memmove(&s->atlases[i], &s->atlases[i + 1],
(s->nb_atlases - i - 1) * sizeof(TextAtlas));
s->nb_atlases--;
return;
}
}
}
#if CONFIG_LIBFREETYPE
/* Ensure GPU atlas for given overlay matches its text/font_size/color. cu ctx pushed by caller. */
static int ensure_text_atlas(AVFilterContext *ctx, const GridOverlay *ov, TextAtlas **out_atlas)
{
CudaGridContext *s = ctx->priv;
TextAtlas *a;
uint8_t *cpu_atlas;
int aw, ah;
int font_size;
CUdeviceptr dev_ptr = 0;
size_t dev_pitch = 0;
CUDA_MEMCPY2D cpy = { 0 };
int ret;
font_size = ov->u.text.font_size > 0 ? ov->u.text.font_size : s->default_font_size;
a = atlas_find_locked(s, ov->id);
if (a &&
!strcmp(a->cached_text, ov->u.text.text) &&
a->cached_font_size == font_size &&
a->cached_r == ov->u.text.r &&
a->cached_g == ov->u.text.g &&
a->cached_b == ov->u.text.b) {
*out_atlas = a;
return 0;
}
if (s->nb_atlases >= MAX_OVERLAYS && !a)
return AVERROR(ENOSPC);
ret = ensure_ft_loaded(ctx);
if (ret < 0) return ret;
cpu_atlas = rasterize_text_rgba(ctx, ov->u.text.text, font_size,
ov->u.text.r, ov->u.text.g, ov->u.text.b,
&aw, &ah);
if (!cpu_atlas) return AVERROR(EINVAL);
/* Allocate pitched device buffer + upload */
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemAllocPitch(
&dev_ptr, &dev_pitch, (size_t)aw * 4, (size_t)ah, 4));
if (ret < 0) { av_free(cpu_atlas); return ret; }
cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
cpy.srcHost = cpu_atlas;
cpy.srcPitch = (size_t)aw * 4;
cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
cpy.dstDevice = dev_ptr;
cpy.dstPitch = dev_pitch;
cpy.WidthInBytes = (size_t)aw * 4;
cpy.Height = (size_t)ah;
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuMemcpy2D(&cpy));
av_free(cpu_atlas);
if (ret < 0) {
s->hwctx->internal->cuda_dl->cuMemFree(dev_ptr);
return ret;
}
if (a) {
atlas_free_gpu_locked(s, a);
} else {
a = &s->atlases[s->nb_atlases++];
memset(a, 0, sizeof(*a));
av_strlcpy(a->id, ov->id, sizeof(a->id));
}
av_strlcpy(a->cached_text, ov->u.text.text, sizeof(a->cached_text));
a->cached_font_size = font_size;
a->cached_r = ov->u.text.r;
a->cached_g = ov->u.text.g;
a->cached_b = ov->u.text.b;
a->device_ptr = dev_ptr;
a->device_pitch = dev_pitch;
a->w = aw;
a->h = ah;
*out_atlas = a;
return 0;
}
static int render_overlay_text(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
{
CudaGridContext *s = ctx->priv;
TextAtlas *a = NULL;
int px, py;
int base_x, base_y, base_w, base_h;
CUdeviceptr dst_y, dst_uv;
int ret;
int aw, ah, ap, ealpha;
if (!ov->u.text.text[0]) return 0;
ret = ensure_text_atlas(ctx, ov, &a);
if (ret < 0) {
av_log(ctx, AV_LOG_WARNING, "text overlay %s: atlas failed (ret=%d)\n", ov->id, ret);
return 0; /* non-fatal — skip */
}
if (!a) return 0;
/* Position: x,y normalized within cell or output (top-left corner of atlas). */
if (ov->cell < 0) {
base_x = 0; base_y = 0;
base_w = s->out_width; base_h = s->out_height;
} else if (ov->cell < s->layout->nb_cells) {
base_x = s->cell_px[ov->cell].x;
base_y = s->cell_px[ov->cell].y;
base_w = s->cell_px[ov->cell].w;
base_h = s->cell_px[ov->cell].h;
} else {
return 0;
}
px = base_x + (int)(ov->x * base_w);
py = base_y + (int)(ov->y * base_h);
px &= ~1; py &= ~1;
/* Clip — if atlas runs past output bounds, kernels still safe (they bounds-check
* against atlas w/h, not dst). But avoid going off output: simple skip when fully outside. */
if (px >= s->out_width || py >= s->out_height) return 0;
aw = a->w; ah = a->h;
/* Reduce ah if it overflows output (avoid OOB on Y plane); UV plane uses h/2 so same scale. */
if (px + aw > s->out_width) aw = s->out_width - px;
if (py + ah > s->out_height) ah = s->out_height - py;
aw &= ~1; ah &= ~1;
if (aw <= 0 || ah <= 0) return 0;
ap = (int)a->device_pitch;
ealpha = ov->opacity;
/* Y plane */
dst_y = (CUdeviceptr)out->data[0];
{
int dst_pitch_y = out->linesize[0];
CUdeviceptr atl = a->device_ptr;
void *args[] = { &dst_y, &dst_pitch_y, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
s->cu_func_alpha_blit_rgba_y,
DIV_UP(aw, BLOCKX), DIV_UP(ah, BLOCKY), 1,
BLOCKX, BLOCKY, 1,
0, s->cu_stream, args, NULL));
if (ret < 0) return ret;
}
/* UV plane (kernel does internal half-res subsampling) */
dst_uv = (CUdeviceptr)out->data[1];
{
int dst_pitch_uv = out->linesize[1];
CUdeviceptr atl = a->device_ptr;
void *args[] = { &dst_uv, &dst_pitch_uv, &px, &py, &atl, &ap, &aw, &ah, &ealpha };
ret = CHECK_CU(s->hwctx->internal->cuda_dl->cuLaunchKernel(
s->cu_func_alpha_blit_rgba_uv,
DIV_UP(aw / 2, BLOCKX), DIV_UP(ah / 2, BLOCKY), 1,
BLOCKX, BLOCKY, 1,
0, s->cu_stream, args, NULL));
if (ret < 0) return ret;
}
return 0;
}
#endif /* CONFIG_LIBFREETYPE */
static int render_overlay_rect(AVFilterContext *ctx, AVFrame *out, const GridOverlay *ov)
{
CudaGridContext *s = ctx->priv;
@@ -538,7 +907,12 @@ static int render_overlays(AVFilterContext *ctx, AVFrame *out)
if (ret < 0) return ret;
break;
case OV_TYPE_TEXT:
av_log(ctx, AV_LOG_TRACE, "overlay %s: text — Phase 4b-3 (freetype)\n", ov->id);
#if CONFIG_LIBFREETYPE
ret = render_overlay_text(ctx, out, ov);
if (ret < 0) return ret;
#else
av_log(ctx, AV_LOG_TRACE, "overlay %s: text — libfreetype disabled at build\n", ov->id);
#endif
break;
case OV_TYPE_ICON:
av_log(ctx, AV_LOG_TRACE, "overlay %s: icon — Phase 4b-4 (sprite)\n", ov->id);
@@ -688,14 +1062,29 @@ static av_cold int cuda_grid_init(AVFilterContext *ctx)
static av_cold void cuda_grid_uninit(AVFilterContext *ctx)
{
CudaGridContext *s = ctx->priv;
int i;
ff_framesync_uninit(&s->fs);
if (s->cu_module && s->hwctx) {
if (s->hwctx) {
CUcontext dummy;
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
for (i = 0; i < s->nb_atlases; i++) {
if (s->atlases[i].device_ptr) {
s->hwctx->internal->cuda_dl->cuMemFree(s->atlases[i].device_ptr);
s->atlases[i].device_ptr = 0;
}
}
s->nb_atlases = 0;
if (s->cu_module) {
CHECK_CU(s->hwctx->internal->cuda_dl->cuModuleUnload(s->cu_module));
s->cu_module = NULL;
}
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
s->cu_module = NULL;
}
#if CONFIG_LIBFREETYPE
if (s->ft_face) { FT_Done_Face(s->ft_face); s->ft_face = NULL; }
if (s->ft_lib) { FT_Done_FreeType(s->ft_lib); s->ft_lib = NULL; }
#endif
if (s->overlay_lock_inited) {
pthread_mutex_destroy(&s->overlay_lock);
s->overlay_lock_inited = 0;
@@ -731,14 +1120,30 @@ static int cuda_grid_process_command(AVFilterContext *ctx, const char *cmd,
}
pthread_mutex_lock(&s->overlay_lock);
ret = overlay_remove_locked(s, id);
/* Best-effort atlas GPU cleanup: cu ctx may not be pushed here, push it. */
if (ret == 0 && s->hwctx) {
CUcontext dummy;
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
atlas_remove_locked(s, id);
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
}
pthread_mutex_unlock(&s->overlay_lock);
if (res) snprintf(res, res_len, ret == 0 ? "ok id=%s" : "not_found id=%s", id);
return ret;
}
if (!strcmp(cmd, "clear_overlays")) {
int i;
pthread_mutex_lock(&s->overlay_lock);
s->nb_overlays = 0;
if (s->hwctx) {
CUcontext dummy;
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPushCurrent(s->cu_ctx));
for (i = 0; i < s->nb_atlases; i++)
atlas_free_gpu_locked(s, &s->atlases[i]);
s->nb_atlases = 0;
CHECK_CU(s->hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
}
pthread_mutex_unlock(&s->overlay_lock);
if (res) av_strlcpy(res, "ok", res_len);
return 0;
@@ -899,6 +1304,10 @@ static const AVOption cuda_grid_options[] = {
OFFSET(out_width), AV_OPT_TYPE_INT, { .i64 = 1920 }, 16, 16384, FLAGS },
{ "out_h", "высота output frame в пикселях",
OFFSET(out_height), AV_OPT_TYPE_INT, { .i64 = 1080 }, 16, 16384, FLAGS },
{ "font_file", "TTF/OTF font path (default: search DejaVu/Liberation)",
OFFSET(font_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
{ "font_size", "default text overlay size (px), используется если overlay не указал свой",
OFFSET(default_font_size), AV_OPT_TYPE_INT, { .i64 = 24 }, 6, 256, FLAGS },
{ NULL }
};