[FFmpeg-devel] [PATCH 1/1] fftools: Implemented text to bitmap subtitles!
Traian Coza
traian.coza at gmail.com
Tue May 3 21:31:14 EEST 2022
Use libass to render text-based subtitles into the AVSubtitle structure during transcoding.
This enables text to bitmap subtitle track conversion.
This only works when FFmpeg is configured with --enable-libass. Otherwise, nothing should change.
A minor version bump is required (a field is appended to AVCodecContext).
---
fftools/Makefile | 2 +
fftools/ffmpeg.c | 35 ++++-
fftools/text_to_bitmap.c | 303 +++++++++++++++++++++++++++++++++++++++
fftools/text_to_bitmap.h | 37 +++++
libavcodec/avcodec.h | 9 ++
5 files changed, 385 insertions(+), 1 deletion(-)
create mode 100644 fftools/text_to_bitmap.c
create mode 100644 fftools/text_to_bitmap.h
diff --git a/fftools/Makefile b/fftools/Makefile
index 81ad6c4f4f..f9334a5622 100644
--- a/fftools/Makefile
+++ b/fftools/Makefile
@@ -15,6 +15,8 @@ OBJS-ffmpeg += \
fftools/ffmpeg_mux.o \
fftools/ffmpeg_opt.o \
+OBJS-ffmpeg-$(CONFIG_LIBASS) += fftools/text_to_bitmap.o
+
define DOFFTOOL
OBJS-$(1) += fftools/cmdutils.o fftools/opt_common.o fftools/$(1).o $(OBJS-$(1)-yes)
$(1)$(PROGSSUF)_g$(EXESUF): $$(OBJS-$(1))
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index a85ed18b08..60d613e35f 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -71,6 +71,10 @@
# include "libavfilter/buffersrc.h"
# include "libavfilter/buffersink.h"
+#if CONFIG_LIBASS
+#include "fftools/text_to_bitmap.h"
+#endif
+
#if HAVE_SYS_RESOURCE_H
#include <sys/time.h>
#include <sys/types.h>
@@ -2332,6 +2336,7 @@ static int transcode_subtitles(InputStream *ist, AVPacket *pkt, int *got_output,
int free_sub = 1;
int i, ret = avcodec_decode_subtitle2(ist->dec_ctx,
&subtitle, got_output, pkt);
+ int rendered = 0; // Variable for text to bitmap support
check_decode_result(NULL, got_output, ret);
@@ -2391,6 +2396,18 @@ static int transcode_subtitles(InputStream *ist, AVPacket *pkt, int *got_output,
|| ost->enc->type != AVMEDIA_TYPE_SUBTITLE)
continue;
+#if CONFIG_LIBASS
+ // Support text to bitmap
+ if (avcodec_descriptor_get(ost->enc_ctx->codec_id)->props & AV_CODEC_PROP_BITMAP_SUB)
+ if (avcodec_descriptor_get(ist->dec_ctx->codec_id)->props & AV_CODEC_PROP_TEXT_SUB)
+ if (!rendered) { // Make sure not to render twice
+ avpriv_render_avsub_ass(ist->dec_ctx->ass_context, &subtitle);
+ for (int r = 0; r < subtitle.num_rects; r++)
+ subtitle.rects[r]->type = SUBTITLE_BITMAP;
+ rendered = 1;
+ }
+#endif
+
do_subtitle_out(output_files[ost->file_index], ost, &subtitle);
}
@@ -2685,6 +2702,9 @@ static int init_input_stream(int ist_index, char *error, int error_len)
* audio, and video decoders such as cuvid or mediacodec */
ist->dec_ctx->pkt_timebase = ist->st->time_base;
+ // For text to bitmap rendering
+ ist->dec_ctx->ass_context = NULL;
+
if (!av_dict_get(ist->decoder_opts, "threads", NULL, 0))
av_dict_set(&ist->decoder_opts, "threads", "auto", 0);
/* Attached pics are sparse, therefore we would not want to delay their decoding till EOF. */
@@ -3213,12 +3233,21 @@ static int init_output_stream(OutputStream *ost, AVFrame *frame,
input_props = input_descriptor->props & (AV_CODEC_PROP_TEXT_SUB | AV_CODEC_PROP_BITMAP_SUB);
if (output_descriptor)
output_props = output_descriptor->props & (AV_CODEC_PROP_TEXT_SUB | AV_CODEC_PROP_BITMAP_SUB);
+#if CONFIG_LIBASS
+ if (input_props == AV_CODEC_PROP_BITMAP_SUB && output_props == AV_CODEC_PROP_TEXT_SUB) {
+ snprintf(error, error_len, "Subtitle encoding from bitmap to text currently not possible");
+ return AVERROR_INVALIDDATA;
+ }
+ if (input_props == AV_CODEC_PROP_TEXT_SUB && output_props == AV_CODEC_PROP_BITMAP_SUB)
+ avpriv_init_ass_context(ist, ost);
+#else
if (input_props && output_props && input_props != output_props) {
snprintf(error, error_len,
"Subtitle encoding currently only possible from text to text "
- "or bitmap to bitmap");
+ "or bitmap to bitmap (configure with --enable-libass for text to bitmap support)");
return AVERROR_INVALIDDATA;
}
+#endif
}
if ((ret = avcodec_open2(ost->enc_ctx, codec, &ost->encoder_opts)) < 0) {
@@ -4485,6 +4514,10 @@ static int transcode(void)
ist = input_streams[i];
if (ist->decoding_needed) {
avcodec_close(ist->dec_ctx);
+#if CONFIG_LIBASS
+ if (ist->dec_ctx->ass_context != NULL) // This really has to be done here, sorry
+ avpriv_free_ass_context(ist->dec_ctx->ass_context);
+#endif
if (ist->hwaccel_uninit)
ist->hwaccel_uninit(ist->dec_ctx);
}
diff --git a/fftools/text_to_bitmap.c b/fftools/text_to_bitmap.c
new file mode 100644
index 0000000000..20aa7201d8
--- /dev/null
+++ b/fftools/text_to_bitmap.c
@@ -0,0 +1,303 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * text to bitmap support code.
+ *
+ * This file contains a function to initiate the functionality for a stream,
+ * to render any AVSubtitle structure, and to free the structures allocated at the start
+ */
+
+#include "text_to_bitmap.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fftools/ffmpeg.h"
+#include "../libavcodec/avcodec.h"
+#include "../libavcodec/ass_split.h"
+#include "libavutil/log.h"
+#include "libavutil/mem.h"
+#include "libavutil/pixfmt.h"
+
+/**
+ * Holds the objects used by the rendering function so they don't have to be reinitialized every time.
+ * One instance is created per input stream by avpriv_init_ass_context() and
+ * released by avpriv_free_ass_context().
+ */
+struct ASS_Context {
+ // libass library handle, created with ass_library_init()
+ ASS_Library *library;
+ // renderer created from `library`, configured with the frame size
+ ASS_Renderer *renderer;
+ // track parsed from the decoder's subtitle header; holds the event being rendered
+ ASS_Track *track;
+ // parser state built from the subtitle header, used to split dialog lines
+ ASSSplitContext *ass_split_context;
+};
+
+// Taken from vf_subtitles.c
+/* Translation table from libass message levels (0 through 7) to av_log levels */
+static const int ass_libavfilter_log_level_map[] = {
+    AV_LOG_FATAL,   /* 0: MSGL_FATAL */
+    AV_LOG_ERROR,   /* 1: MSGL_ERR */
+    AV_LOG_WARNING, /* 2: MSGL_WARN */
+    AV_LOG_WARNING, /* 3: <undefined> */
+    AV_LOG_INFO,    /* 4: MSGL_INFO */
+    AV_LOG_INFO,    /* 5: <undefined> */
+    AV_LOG_VERBOSE, /* 6: MSGL_V */
+    AV_LOG_DEBUG,   /* 7: MSGL_DBG2 */
+};
+
+// Taken from vf_subtitles.c as well
+/**
+ * Message callback handed to libass: forwards the message to av_log at the
+ * mapped level, followed by a newline. Out-of-range levels are clamped to
+ * the table bounds. The ctx argument is not used.
+ */
+static void ass_log(int ass_level, const char *fmt, va_list args, void *ctx)
+{
+    const int last_index = FF_ARRAY_ELEMS(ass_libavfilter_log_level_map) - 1;
+    const int av_level   = ass_libavfilter_log_level_map[av_clip(ass_level, 0, last_index)];
+
+    av_vlog(NULL, av_level, fmt, args);
+    av_log(NULL, av_level, "\n");
+}
+
+/**
+ * Creates the ASS_Context used for text to bitmap rendering and stores it in
+ * ist->dec_ctx->ass_context.
+ *
+ * Does nothing if ist->dec_ctx->ass_context is already set.
+ * The frame size is taken from the first of these that has a non-zero width
+ * and height: the output stream's encoder, the input stream's decoder, a
+ * video output stream, a video input stream (the last two are looked up in
+ * the global output_streams / input_streams arrays).
+ *
+ * On any failure (no frame size, no subtitle header, allocation or libass
+ * initialization failure) an error is logged, everything allocated so far is
+ * released and ist->dec_ctx->ass_context is left unset.
+ *
+ * @param ist input stream carrying the text subtitles; its decoder context
+ *            receives the new ASS_Context
+ * @param ost output stream the subtitles are transcoded to
+ */
+void avpriv_init_ass_context(InputStream *ist, OutputStream *ost)
+{
+    ASS_Context *context;
+    int width = 0, height = 0;
+
+    if (ist->dec_ctx->ass_context)
+        return;
+
+    // Find a frame size before allocating anything, so that failing here
+    // cannot leak.
+    if (ost->enc_ctx->width != 0 && ost->enc_ctx->height != 0) {
+        // Output stream (encoder side)
+        width  = ost->enc_ctx->width;
+        height = ost->enc_ctx->height;
+    } else if (ist->dec_ctx->width != 0 && ist->dec_ctx->height != 0) {
+        // Input stream (decoder side)
+        width  = ist->dec_ctx->width;
+        height = ist->dec_ctx->height;
+    }
+    // Fall back to a video output stream with a known size...
+    for (int j = 0; j < nb_output_streams && !(width && height); j++) {
+        const AVCodecContext *enc = output_streams[j]->enc_ctx;
+        if (enc->codec_type == AVMEDIA_TYPE_VIDEO && enc->width && enc->height) {
+            width  = enc->width;
+            height = enc->height;
+        }
+    }
+    // ...then to a video input stream with a known size.
+    for (int j = 0; j < nb_input_streams && !(width && height); j++) {
+        const AVCodecContext *dec = input_streams[j]->dec_ctx;
+        if (dec->codec_type == AVMEDIA_TYPE_VIDEO && dec->width && dec->height) {
+            width  = dec->width;
+            height = dec->height;
+        }
+    }
+    if (!width || !height) {
+        av_log(NULL, AV_LOG_ERROR, "Cannot render text subtitle without frame size\n");
+        return;
+    }
+
+    if (!ist->dec_ctx->subtitle_header) {
+        av_log(NULL, AV_LOG_ERROR, "Cannot render text subtitle without subtitle header\n");
+        return;
+    }
+
+    // Zero-initialized so the failure path can tell what has been created.
+    // Plain calloc/free is used to match avpriv_free_ass_context().
+    context = calloc(1, sizeof(*context));
+    if (!context) {
+        av_log(NULL, AV_LOG_ERROR, "Cannot allocate text subtitle rendering context\n");
+        return;
+    }
+
+    context->library = ass_library_init();
+    if (!context->library)
+        goto fail;
+    ass_set_message_cb(context->library, ass_log, NULL);
+    ass_set_extract_fonts(context->library, 1);
+    // TODO: ass_add_font(context->library, ...);
+
+    context->renderer = ass_renderer_init(context->library);
+    if (!context->renderer)
+        goto fail;
+    ass_set_frame_size(context->renderer, width, height);
+    ass_set_pixel_aspect(context->renderer, 1);
+    ass_set_storage_size(context->renderer, width, height);
+    ass_set_shaper(context->renderer, 0);
+    ass_set_fonts(context->renderer, NULL, NULL, 1, NULL, 1);
+
+    context->track = ass_read_memory(context->library,
+        (char *)ist->dec_ctx->subtitle_header, ist->dec_ctx->subtitle_header_size, NULL);
+    if (!context->track)
+        goto fail;
+
+    context->ass_split_context = ff_ass_split((char *)ist->dec_ctx->subtitle_header);
+    if (!context->ass_split_context)
+        goto fail;
+
+    ist->dec_ctx->ass_context = context;
+    return;
+
+fail:
+    av_log(NULL, AV_LOG_ERROR, "Cannot initialize libass for text subtitle rendering\n");
+    // Release in reverse order of creation; the library goes last because
+    // the renderer and the track were created from it.
+    if (context->track)
+        ass_free_track(context->track);
+    if (context->renderer)
+        ass_renderer_done(context->renderer);
+    if (context->library)
+        ass_library_done(context->library);
+    free(context);
+}
+
+/**
+ * Frees what was allocated in avpriv_init_ass_context().
+ *
+ * Objects are released in reverse order of creation: the split context, the
+ * track and the renderer first, and the libass library handle last, since
+ * the renderer and the track were both created from that library.
+ *
+ * @param context context to free; NULL is accepted and ignored
+ */
+void avpriv_free_ass_context(ASS_Context *context)
+{
+    if (!context)
+        return;
+
+    if (context->ass_split_context)
+        ff_ass_split_free(context->ass_split_context);
+    if (context->track)
+        ass_free_track(context->track);
+    if (context->renderer)
+        ass_renderer_done(context->renderer);
+    if (context->library)
+        ass_library_done(context->library);
+    free(context);
+}
+
+/* libass stores an RGBA color in the format RRGGBBTT, where TT is the transparency level */
+#define AR(c) (((c) >> 24) & 0xFF)
+#define AG(c) (((c) >> 16) & 0xFF)
+#define AB(c) (((c) >>  8) & 0xFF)
+#define AA(c) ((0xFF - (c)) & 0xFF) /* opacity = 0xFF - transparency */
+
+/* Bitmap values at or above this threshold are drawn; lower values are left
+ * transparent. Written in hex because binary literals (0b...) are not
+ * standard C before C23. */
+#define ALPHA_THRESHOLD 0x80
+
+/* strdup() that tolerates NULL, since dialog fields may be unset. */
+static char *ass_strdup_or_null(const char *s)
+{
+    return s ? strdup(s) : NULL;
+}
+
+/**
+ * Renders the AVSubtitle and sets the bitmap data for each AVSubtitleRect.
+ *
+ * Rects that already have data[0] set, or that carry no ASS text, are left
+ * untouched. For every other rect the ASS dialog line is loaded as the only
+ * event of the context's track and rendered at the midpoint of its display
+ * interval. Each image returned by libass gets its own palette entry (entry
+ * 0 is the transparent background); bitmap values of at least
+ * ALPHA_THRESHOLD are drawn with that entry. rect->x/y/w/h are set to the
+ * bounding box of all images.
+ *
+ * The pixel and palette buffers are allocated with av_mallocz() so that
+ * avsubtitle_free() can release them. The palette is a full AVPALETTE_SIZE
+ * table of native-endian 32-bit ARGB entries.
+ *
+ * rect->type is not changed here. If a rect cannot be rendered, a message
+ * is logged and the rect is left without bitmap data.
+ *
+ * @param context rendering state from avpriv_init_ass_context(); if NULL,
+ *                an error is logged and nothing is rendered
+ * @param sub     subtitle whose rects are rendered in place
+ */
+void avpriv_render_avsub_ass(ASS_Context *context, AVSubtitle *sub)
+{
+    ASS_Track *track;
+
+    if (!context) {
+        av_log(NULL, AV_LOG_ERROR, "Cannot render text subtitle: no libass context\n");
+        return;
+    }
+    track = context->track;
+
+    for (int r = 0; r < sub->num_rects; r++) {
+        AVSubtitleRect *rect = sub->rects[r];
+        ASSDialog *dialog;
+        ASS_Image *image, *img;
+        uint32_t *palette;
+        size_t nb_pixels;
+        int left, top, right, bottom;
+        int nb_images = 0, color;
+
+        if (rect->data[0] || !rect->ass)
+            continue;
+
+        dialog = ff_ass_split_dialog(context->ass_split_context, rect->ass);
+        if (!dialog) {
+            av_log(NULL, AV_LOG_WARNING, "failed to parse ass: %s\n", rect->ass);
+            continue;
+        }
+
+        // The track holds exactly one event at a time: drop whatever is
+        // there and allocate a fresh one.
+        for (int e = 0; e < track->n_events; e++)
+            ass_free_event(track, e);
+        track->n_events = 0;
+        if (ass_alloc_event(track) < 0) {
+            av_log(NULL, AV_LOG_ERROR, "failed to allocate ass event\n");
+            ff_ass_free_dialog(&dialog);
+            continue;
+        }
+        track->n_events = 1;
+        track->events[0].Start = sub->start_display_time + sub->pts / (AV_TIME_BASE / 1000);
+        track->events[0].Duration = sub->end_display_time - sub->start_display_time;
+        track->events[0].Effect = ass_strdup_or_null(dialog->effect);
+        track->events[0].Layer = dialog->layer;
+        track->events[0].MarginL = dialog->margin_l;
+        track->events[0].MarginR = dialog->margin_r;
+        track->events[0].MarginV = dialog->margin_v;
+        track->events[0].Name = ass_strdup_or_null(dialog->name);
+        track->events[0].Text = ass_strdup_or_null(dialog->text);
+        track->events[0].ReadOrder = dialog->readorder;
+        track->events[0].Style = 0;
+        // No break: when several styles share the name, the last one wins.
+        for (int style = 0; dialog->style && style < track->n_styles; style++)
+            if (track->styles[style].Name &&
+                !strcmp(track->styles[style].Name, dialog->style))
+                track->events[0].Style = style;
+        track->events[0].render_priv = NULL;
+        ff_ass_free_dialog(&dialog);
+
+        image = ass_render_frame(context->renderer, track,
+            track->events[0].Start + track->events[0].Duration / 2, NULL);
+        if (image == NULL) av_log(NULL, AV_LOG_WARNING,
+            "failed to render ass: %s\n", rect->ass);
+
+        // Bounding box of all images. The edges are tracked as absolute
+        // coordinates: deriving the right/bottom edge from x + w while x is
+        // still being lowered would shrink the box and let the pixel loop
+        // below write outside the buffer.
+        left = right  = image ? image->dst_x : 0;
+        top  = bottom = image ? image->dst_y : 0;
+        for (img = image; img != NULL; img = img->next) {
+            left   = FFMIN(left,   img->dst_x);
+            top    = FFMIN(top,    img->dst_y);
+            right  = FFMAX(right,  img->dst_x + img->w);
+            bottom = FFMAX(bottom, img->dst_y + img->h);
+            nb_images++;
+        }
+        if (nb_images > AVPALETTE_COUNT - 1)
+            av_log(NULL, AV_LOG_WARNING,
+                   "subtitle uses %d images, only the first %d are rendered\n",
+                   nb_images, AVPALETTE_COUNT - 1);
+
+        // Allocate at least one byte so an empty rendering still yields a
+        // valid buffer.
+        nb_pixels = (size_t)(right - left) * (size_t)(bottom - top);
+        rect->data[0] = av_mallocz(nb_pixels ? nb_pixels : 1);
+        rect->data[1] = av_mallocz(AVPALETTE_SIZE);
+        if (!rect->data[0] || !rect->data[1]) {
+            av_log(NULL, AV_LOG_ERROR, "failed to allocate subtitle bitmap\n");
+            av_freep(&rect->data[0]);
+            av_freep(&rect->data[1]);
+            continue;
+        }
+        rect->data[2] = rect->data[3] = NULL;
+
+        rect->x = left;
+        rect->y = top;
+        rect->w = right - left;
+        rect->h = bottom - top;
+        rect->linesize[0] = rect->w;
+        rect->linesize[1] = rect->linesize[2] = rect->linesize[3] = 0;
+        // Transparent background counts as a color
+        rect->nb_colors = 1 + FFMIN(nb_images, AVPALETTE_COUNT - 1);
+
+        // av_mallocz() already left every pixel at index 0 and palette
+        // entry 0 at fully transparent.
+        palette = (uint32_t *)rect->data[1];
+        for (img = image, color = 1; img != NULL && color < AVPALETTE_COUNT;
+             img = img->next, color++) {
+            // Palette entries are native-endian 32-bit ARGB values
+            palette[color] = (uint32_t)AA(img->color) << 24 |
+                             (uint32_t)AR(img->color) << 16 |
+                             (uint32_t)AG(img->color) <<  8 |
+                             (uint32_t)AB(img->color);
+            for (int y = 0; y < img->h; y++) {
+                uint8_t *dst = rect->data[0] +
+                               (img->dst_y + y - rect->y) * rect->linesize[0] +
+                               (img->dst_x - rect->x);
+                for (int x = 0; x < img->w; x++)
+                    if (img->bitmap[y * img->stride + x] >= ALPHA_THRESHOLD)
+                        dst[x] = color;
+            }
+        }
+    }
+}
+
+/*
+static void print_ass_image(const ASS_Image *image)
+{
+ int index = 0;
+ for (; image != NULL; image = image->next, index++)
+ {
+ printf("index: %d\n", index);
+ printf("image->(dst_x,dst_y): (%d,%d)\n", image->dst_x, image->dst_y);
+ printf("image->(w,h): (%d,%d)\n", image->w, image->h);
+ printf("image->stride: %d\n", image->stride);
+ printf("image->type: %d\n", image->type);
+ printf("image->color: [%u,%u,%u,%u]\n", AR(image->color), AG(image->color), AB(image->color), AA(image->color));
+ for (int y = 0; y < image->h; y++, printf("\n"))
+ for (int x = 0; x < image->w; x++)
+ printf("%02X", image->bitmap[y * image->stride + x]);
+ printf("\n");
+ }
+}
+
+static void print_subtitle(AVSubtitle sub)
+{
+ printf("sub.format: %u\n", sub.format);
+ printf("sub.start_display_time: %u\n", sub.start_display_time);
+ printf("sub.end_display_time: %u\n", sub.end_display_time);
+ printf("sub.num_rects: %u\n", sub.num_rects);
+ printf("sub.pts: %ld\n", sub.pts);
+ for (int i = 0; i < sub.num_rects; i++)
+ {
+ printf("sub.rects[%d]->type: %d\n", i, sub.rects[i]->type);
+ printf("sub.rects[%d]->nb_colors: %d\n", i, sub.rects[i]->nb_colors);
+ printf("sub.rects[%d]->(x,y,w,h): (%d,%d,%d,%d)\n", i, sub.rects[i]->x, sub.rects[i]->y, sub.rects[i]->w, sub.rects[i]->h);
+ printf("sub.rects[%d]->linesize: [%d,%d,%d,%d]\n", i, sub.rects[i]->linesize[0], sub.rects[i]->linesize[1], sub.rects[i]->linesize[2], sub.rects[i]->linesize[3]);
+ switch (sub.rects[i]->type)
+ {
+ case SUBTITLE_TEXT:
+ printf("sub.rects[%d]->text: %s\n", i, sub.rects[i]->text);
+ break;
+ case SUBTITLE_ASS:
+ printf("sub.rects[%d]->ass: %s\n", i, sub.rects[i]->ass);
+ break;
+ case SUBTITLE_BITMAP:
+ for (int c = 0; c < sub.rects[i]->nb_colors; c++)
+ printf("color %d: [%u,%u,%u,%u]\n", c,
+ sub.rects[i]->data[1][c * 4 + 0],
+ sub.rects[i]->data[1][c * 4 + 1],
+ sub.rects[i]->data[1][c * 4 + 2],
+ sub.rects[i]->data[1][c * 4 + 3]);
+ for (int y = 0; y < sub.rects[i]->h; y++, printf("\n"))
+ for (int x = 0; x < sub.rects[i]->w; x++)
+ printf("%d", sub.rects[i]->data[0][y * sub.rects[i]->w + x]);
+ break;
+ }
+ }
+ printf("\n");
+}
+*/
\ No newline at end of file
diff --git a/fftools/text_to_bitmap.h b/fftools/text_to_bitmap.h
new file mode 100644
index 0000000000..4b30f8c297
--- /dev/null
+++ b/fftools/text_to_bitmap.h
@@ -0,0 +1,37 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * text_to_bitmap header file
+ */
+
+#ifndef FFTOOLS_TEXT_TO_BITMAP_H
+#define FFTOOLS_TEXT_TO_BITMAP_H
+
+#include <ass/ass.h>
+#include "fftools/ffmpeg.h"
+
+// Opaque rendering state; the structure is defined in text_to_bitmap.c
+struct ASS_Context;
+typedef struct ASS_Context ASS_Context;
+
+/**
+ * Create the rendering state for a text to bitmap conversion and store it
+ * in ist->dec_ctx->ass_context. Does nothing if that field is already set.
+ */
+void avpriv_init_ass_context(InputStream *ist, OutputStream *ost);
+/**
+ * Render the rects of sub that have no bitmap data yet and fill in their
+ * position, size, pixel data and palette.
+ */
+void avpriv_render_avsub_ass(ASS_Context *context, AVSubtitle *sub);
+/**
+ * Release a context created by avpriv_init_ass_context().
+ */
+void avpriv_free_ass_context(ASS_Context *context);
+
+#endif //FFTOOLS_TEXT_TO_BITMAP_H
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 4dae23d06e..530c01f193 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -2055,6 +2055,15 @@ typedef struct AVCodecContext {
* The decoder can then override during decoding as needed.
*/
AVChannelLayout ch_layout;
+
+ /**
+ * Pointer to the ASS_Context used by fftools for text to bitmap subtitle
+ * rendering (stored as void *)
+ * - encoding: unused
+ * - decoding: is set to NULL initially in init_input_stream, and if
+ * there must be a text to bitmap conversion, is set to a
+ * new ASS_Context in init_output_stream and freed again
+ * in transcode
+ */
+ void *ass_context;
} AVCodecContext;
/**
--
2.34.1
More information about the ffmpeg-devel
mailing list