[FFmpeg-devel] [PATCH] Add SRV3 decoder/demuxer

Hubert Głuchowski fishhh at fishhh.dev
Sun Dec 15 19:48:15 EET 2024


This commit adds preliminary support for decoding the SRV3 subtitle format.
SRV3 is the internal format YouTube uses for their captions. Supporting it
in ffmpeg allows video players to play a significant subset of SRV3
mostly correctly by converting it to ASS.
Currently the following features are unsupported or incomplete:
- Vertical text
- Scrolling text
- Ruby text
- Background box support is janky
These issues are mostly due to limitations of the ASSv3 format.
---
This is my first time interacting with the ffmpeg-devel mailing list so
please bear with me, I've been sitting on these changes for almost a
year and only now managed to kind of overcome the intimidating nature of
ffmpeg-devel.

At first it seemed to me like the demuxer should take care of parsing
the subtitle file so I did it this way and added opaque side data that
contains pointers to an internal representation of SRV3 metadata. I don't
know whether this is the right approach though, please correct me if it
isn't.

I haven't added tests since I haven't looked into how that would be
done, but I've been using it in my mpv build for almost a year now and
it seems to work fine.
Although, as if specifically to inconvenience me, libass appears to have
introduced what seems to be a bug in its background rendering, which I
only discovered while writing this. I don't think this patch is at
fault, though.

 configure                |   2 +
 libavcodec/Makefile      |   1 +
 libavcodec/allcodecs.c   |   1 +
 libavcodec/codec_desc.c  |   7 +
 libavcodec/codec_id.h    |   1 +
 libavcodec/packet.c      |   2 +
 libavcodec/packet.h      |  12 +
 libavcodec/srv3dec.c     | 260 +++++++++++++++++++
 libavformat/Makefile     |   1 +
 libavformat/allformats.c |   1 +
 libavformat/srv3.h       |  95 +++++++
 libavformat/srv3dec.c    | 542 +++++++++++++++++++++++++++++++++++++++
 12 files changed, 925 insertions(+)
 create mode 100644 libavcodec/srv3dec.c
 create mode 100644 libavformat/srv3.h
 create mode 100644 libavformat/srv3dec.c

diff --git a/configure b/configure
index bf55ba67fa..a61333a93d 100755
--- a/configure
+++ b/configure
@@ -3724,6 +3724,8 @@ wtv_demuxer_select="mpegts_demuxer riffdec"
 wtv_muxer_select="mpegts_muxer riffenc"
 xmv_demuxer_select="riffdec"
 xwma_demuxer_select="riffdec"
+srv3_demuxer_deps="libxml2"
+srv3_demuxer_select="srv3dec"
 
 # indevs / outdevs
 android_camera_indev_deps="android camera2ndk mediandk pthreads"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index c946444175..a89b5c27f2 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -707,6 +707,7 @@ OBJS-$(CONFIG_SP5X_DECODER)            += sp5xdec.o
 OBJS-$(CONFIG_SRGC_DECODER)            += mscc.o
 OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o htmlsubtitles.o
 OBJS-$(CONFIG_SRT_ENCODER)             += srtenc.o ass_split.o
+OBJS-$(CONFIG_SRV3_DECODER)            += srv3dec.o ass.o
 OBJS-$(CONFIG_STL_DECODER)             += textdec.o ass.o
 OBJS-$(CONFIG_SUBRIP_DECODER)          += srtdec.o ass.o htmlsubtitles.o
 OBJS-$(CONFIG_SUBRIP_ENCODER)          += srtenc.o ass_split.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 0b559dfc58..7bb2a4170d 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -738,6 +738,7 @@ extern const FFCodec ff_webvtt_encoder;
 extern const FFCodec ff_webvtt_decoder;
 extern const FFCodec ff_xsub_encoder;
 extern const FFCodec ff_xsub_decoder;
+extern const FFCodec ff_srv3_decoder;
 
 /* external libraries */
 extern const FFCodec ff_aac_at_encoder;
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index bc9163bf98..2832e817b5 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -3634,6 +3634,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("ARIB STD-B24 caption"),
         .profiles  = NULL_IF_CONFIG_SMALL(ff_arib_caption_profiles),
     },
+    {
+        .id        = AV_CODEC_ID_SRV3,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "srv3",
+        .long_name = NULL_IF_CONFIG_SMALL("SRV3 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
 
     /* other kind of codecs and pseudo-codecs */
     {
diff --git a/libavcodec/codec_id.h b/libavcodec/codec_id.h
index 6bfaa02601..774de43f4d 100644
--- a/libavcodec/codec_id.h
+++ b/libavcodec/codec_id.h
@@ -579,6 +579,7 @@ enum AVCodecID {
     AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
     AV_CODEC_ID_TTML,
     AV_CODEC_ID_ARIB_CAPTION,
+    AV_CODEC_ID_SRV3,
 
     /* other specific kind of codecs (generally used for attachments) */
     AV_CODEC_ID_FIRST_UNKNOWN = 0x18000,           ///< A dummy ID pointing at the start of various fake codecs.
diff --git a/libavcodec/packet.c b/libavcodec/packet.c
index 5104eb98b1..c6425c8c1d 100644
--- a/libavcodec/packet.c
+++ b/libavcodec/packet.c
@@ -288,6 +288,8 @@ const char *av_packet_side_data_name(enum AVPacketSideDataType type)
     case AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL:   return "Matroska BlockAdditional";
     case AV_PKT_DATA_WEBVTT_IDENTIFIER:          return "WebVTT ID";
     case AV_PKT_DATA_WEBVTT_SETTINGS:            return "WebVTT Settings";
+    case AV_PKT_DATA_SRV3_HEAD:                  return "SRV3 Head";
+    case AV_PKT_DATA_SRV3_EVENT:                 return "SRV3 Event metadata";
     case AV_PKT_DATA_METADATA_UPDATE:            return "Metadata Update";
     case AV_PKT_DATA_MPEGTS_STREAM_ID:           return "MPEGTS Stream ID";
     case AV_PKT_DATA_MASTERING_DISPLAY_METADATA: return "Mastering display metadata";
diff --git a/libavcodec/packet.h b/libavcodec/packet.h
index c1f1ad7b43..d3ccb97809 100644
--- a/libavcodec/packet.h
+++ b/libavcodec/packet.h
@@ -345,6 +345,18 @@ enum AVPacketSideDataType {
      */
     AV_PKT_DATA_LCEVC,
 
+    /**
+     * SRV3 subtitle header.
+     * Not part of public API, do not rely on its existence or layout.
+     */
+    AV_PKT_DATA_SRV3_HEAD,
+
+    /**
+     * SRV3 metadata associated with a single subtitle event.
+     * Not part of public API, do not rely on its existence or layout.
+     */
+    AV_PKT_DATA_SRV3_EVENT,
+
     /**
      * The number of side data types.
      * This is not part of the public API/ABI in the sense that it may
diff --git a/libavcodec/srv3dec.c b/libavcodec/srv3dec.c
new file mode 100644
index 0000000000..a924c1e50f
--- /dev/null
+++ b/libavcodec/srv3dec.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2024 Hubert Głuchowski
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SRV3/YTT subtitle decoder
+ * @see https://github.com/arcusmaximus/YTSubConverter
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "codec_internal.h"
+#include "libavformat/srv3.h"
+#include "libavutil/bprint.h"
+#include "version.h"
+
+static const int PLAY_RES_X = 1280;     /* written to the ASS header as PlayResX */
+static const int PLAY_RES_Y = 720;      /* written to the ASS header as PlayResY */
+static const int BASE_FONT_SIZE = 38;   /* ASS font size produced for an SRV3 size of 100 */
+
+// From https://github.com/arcusmaximus/YTSubConverter/blob/38fb2ab469f37e8f3a5a6a27adf91d9d0e81ea4f/YTSubConverter.Shared/Formats/YttDocument.cs#L1123
+static const char *srv3_font_style_to_font_name(int font_style) {
+    switch(font_style) {
+    case 1:
+        return  "Courier New";
+    case 2:
+        return "Times New Roman";
+    case 3:
+        return "Lucida Console";
+    case 4:
+        return "Comic Sans Ms";
+    case 6:
+        return "Monotype Corsiva";
+    case 7:
+        return "Carrois Gothic Sc";
+    default:
+        return "Roboto";
+    };
+}
+
+static int srv3_point_to_ass_alignment(int point) {
+    if (point >= 6)
+        return point - 5;
+    else if (point < 3)
+        return point + 7;
+    return point + 1;
+}
+
+static int srv3_coord_to_ass(int coord, int max) {
+    return (2.0 + coord * 0.96) / 100.0 * max;
+}
+
+static float srv3_font_size_to_ass(int size) {
+    return BASE_FONT_SIZE * (1.0 + ((size / 100.0) - 1.0) / 4.0);
+}
+
+#define RGB2BGR(color) ((((color) & 0x0000FF) << 16) | ((color) & 0x00FF00) | (((color) & 0xFF0000) >> 16))
+#define RGB2ASS(color, alpha) ((unsigned)RGB2BGR(color) | ((unsigned)(0xFF - (alpha)) << 24)) /* top byte holds 0xFF - alpha */
+#define ASSBOOL(value) (((value) > 0) * -1) /* -1 for positive values, 0 otherwise */
+
+static void srv3_style_segment(AVCodecContext *ctx, AVBPrint *buf, SRV3Segment *segment) {
+    av_bprintf(buf, "{\\rP%i}", segment->pen->id + 1);
+
+    if (segment->pen->background_alpha == 0) {
+        switch(segment->pen->edge_type) {
+        case SRV3_EDGE_HARD_SHADOW:
+            av_bprintf(buf, "{\\shad2}");
+            break;
+        /*
+         * I think falling back to a glow effect on soft shadow is better than just using a normal shadow.
+         * YTSubConverter doesn't agree with me on this and I'm not completely sure whether it's the right choice.
+         */
+        case SRV3_EDGE_SOFT_SHADOW:
+        case SRV3_EDGE_GLOW:
+            av_bprintf(buf, "{\\bord2\\blur3}");
+            break;
+        case SRV3_EDGE_BEVEL:
+            av_bprintf(buf, "{\\shad2}");
+            break;
+        case SRV3_EDGE_NONE:
+            break;
+        default:
+            av_log(ctx, AV_LOG_WARNING, "bug: Unhandled edge type %i in decoder\n", segment->pen->edge_type);
+            break;
+        }
+    } else if (segment->pen->edge_type) {
+        /*
+         * ASS doesn't support text shadows or outlines with BorderStyle 3.
+         * TODO: Add an option to enable BorderStyle 4 usage
+         */
+    }
+}
+
+static void srv3_process_text(AVBPrint *buf, const char *text, int count) {
+    for (int i = 0; i < count; ++i) {
+        if (text[i] == '\r')
+            continue;
+        else if (text[i] == '\n')
+            av_bprintf(buf, "\\N");
+        else
+            av_bprintf(buf, "%c", text[i]);
+    }
+}
+
+static void srv3_position_event(SRV3EventMeta *event, int *x, int *y, int *align) {
+    if (event->wp) {
+        *x = srv3_coord_to_ass(event->wp->x , PLAY_RES_X);
+        *y = srv3_coord_to_ass(event->wp->y, PLAY_RES_Y);
+        *align = srv3_point_to_ass_alignment(event->wp->point);
+    } else {
+        *x = srv3_coord_to_ass(50, PLAY_RES_X);
+        *y = srv3_coord_to_ass(100, PLAY_RES_Y);
+        *align = 2;
+    }
+}
+
+static void srv3_event_text_ass(AVCodecContext *ctx, AVBPrint *buf, const char *text, SRV3EventMeta *event)
+{
+    SRV3Segment *segment;
+    int x, y, alignment;
+
+    srv3_position_event(event, &x, &y, &alignment);
+    av_bprintf(buf, "{\\an%i\\pos(%i,%i)}", alignment, x, y);
+
+    for (segment = event->segments; segment; segment = segment->next) {
+        srv3_style_segment(ctx, buf, segment);
+        srv3_process_text(buf, text, segment->size);
+        text += segment->size;
+    }
+}
+
+static int srv3_decode_frame(AVCodecContext *avctx, AVSubtitle *sub,
+                             int *got_sub_ptr, const AVPacket *avpkt)
+{
+    int ret = 0;
+    FFASSDecoderContext *ctx = avctx->priv_data;
+    const char *text = (const char *)avpkt->data;
+    SRV3EventMeta *event = (SRV3EventMeta*)av_packet_get_side_data(avpkt, AV_PKT_DATA_SRV3_EVENT, NULL);
+    AVBPrint buf;
+
+    if (!text || avpkt->size == 0)
+        return 0;
+    if (!event) /* without the event side data there are no segments or position to render */
+        return AVERROR_INVALIDDATA;
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    srv3_event_text_ass(avctx, &buf, text, event);
+    if (av_bprint_is_complete(&buf))
+        ret = ff_ass_add_rect(sub, buf.str, ctx->readorder++, 0, NULL, NULL);
+    else
+        ret = AVERROR(ENOMEM);
+    av_bprint_finalize(&buf, NULL);
+
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static av_cold int srv3_decoder_init(AVCodecContext *avctx) {
+    int ret = 0;
+    AVBPrint header;
+    const AVPacketSideData *head_sd;
+    SRV3Pen *pen;
+
+    av_bprint_init(&header, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    av_bprintf(&header,
+               "[Script Info]\r\n"
+               "; Script generated by FFmpeg/Lavc%s\r\n"
+               "ScriptType: v4.00+\r\n"
+               "PlayResX: %i\r\n"
+               "PlayResY: %i\r\n"
+               "WrapStyle: 0\r\n"
+               "ScaledBorderAndShadow: yes\r\n"
+               "YCbCr Matrix: None\r\n"
+               "\r\n"
+               "[V4+ Styles]\r\n"
+               "Format: Name, "
+               "Fontname, Fontsize, "
+               "PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
+               "Bold, Italic, Underline, StrikeOut, "
+               "ScaleX, ScaleY, "
+               "Spacing, Angle, "
+               "BorderStyle, Outline, Shadow, "
+               "Alignment, MarginL, MarginR, MarginV, "
+               "Encoding\r\n",
+               !(avctx->flags & AV_CODEC_FLAG_BITEXACT) ? AV_STRINGIFY(LIBAVCODEC_VERSION) : "",
+               PLAY_RES_X, PLAY_RES_Y);
+
+    head_sd = av_packet_side_data_get(avctx->coded_side_data, avctx->nb_coded_side_data, AV_PKT_DATA_SRV3_HEAD);
+    if (head_sd) {
+        for (pen = ((SRV3Head*)head_sd->data)->pens; pen; pen = pen->next)
+            av_bprintf(&header,
+                       "Style: "
+                       "P%i,"                 /* Name */
+                       "%s,%f,"               /* Font{name,size} */
+                       "&H%x,&H0,&H%x,&H%x,"  /* {Primary,Secondary,Outline,Back}Colour */
+                       "%i,%i,0,0,"           /* Bold, Italic, Underline, StrikeOut */
+                       "100,100,"             /* Scale{X,Y} */
+                       "0,0,"                 /* Spacing, Angle */
+                       "%i,%i,0,"             /* BorderStyle, Outline, Shadow */
+                       "2,0,0,0,"             /* Alignment, Margin[LRV] */
+                       "1\r\n",               /* Encoding */
+                       pen->id + 1,
+                       srv3_font_style_to_font_name(pen->font_style), srv3_font_size_to_ass(pen->font_size),
+                       RGB2ASS(pen->foreground_color, pen->foreground_alpha),
+                       pen->background_alpha > 0
+                           ? RGB2ASS(pen->background_color, pen->background_alpha)
+                           : RGB2ASS(pen->edge_color, pen->foreground_alpha),
+                       pen->background_alpha > 0
+                           ? RGB2ASS(pen->background_color, pen->background_alpha)
+                           : RGB2ASS(pen->edge_color, pen->foreground_alpha),
+                       ASSBOOL(pen->attrs & SRV3_PEN_ATTR_BOLD), ASSBOOL(pen->attrs & SRV3_PEN_ATTR_ITALIC),
+                       pen->background_alpha > 0 ? 3 : (pen->edge_type > 0), pen->background_alpha > 0);
+    }
+
+    av_bprintf(&header,
+               "[Events]\r\n"
+               "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n");
+
+    if (!av_bprint_is_complete(&header)) { /* truncated: header.len would overstate the string */
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+    ret = av_bprint_finalize(&header, (char**)&avctx->subtitle_header);
+    if (ret >= 0)
+        avctx->subtitle_header_size = header.len;
+end:
+    av_bprint_finalize(&header, NULL);
+    return ret;
+}
+
+const FFCodec ff_srv3_decoder = {
+    .p.name         = "srv3",
+    CODEC_LONG_NAME("SRV3 subtitle"),
+    .p.type         = AVMEDIA_TYPE_SUBTITLE,
+    .p.id           = AV_CODEC_ID_SRV3,
+    FF_CODEC_DECODE_SUB_CB(srv3_decode_frame),
+    .init           = srv3_decoder_init,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavformat/Makefile b/libavformat/Makefile
index dd96bf7ba8..1c44f35bbc 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -570,6 +570,7 @@ OBJS-$(CONFIG_SPEEX_MUXER)               += oggenc.o \
                                             vorbiscomment.o
 OBJS-$(CONFIG_SRT_DEMUXER)               += srtdec.o subtitles.o
 OBJS-$(CONFIG_SRT_MUXER)                 += srtenc.o
+OBJS-$(CONFIG_SRV3_DEMUXER)              += srv3dec.o subtitles.o
 OBJS-$(CONFIG_STL_DEMUXER)               += stldec.o subtitles.o
 OBJS-$(CONFIG_STR_DEMUXER)               += psxstr.o
 OBJS-$(CONFIG_STREAMHASH_MUXER)          += hashenc.o
diff --git a/libavformat/allformats.c b/libavformat/allformats.c
index 445f13f42a..f56eb34a90 100644
--- a/libavformat/allformats.c
+++ b/libavformat/allformats.c
@@ -451,6 +451,7 @@ extern const FFInputFormat  ff_spdif_demuxer;
 extern const FFOutputFormat ff_spdif_muxer;
 extern const FFInputFormat  ff_srt_demuxer;
 extern const FFOutputFormat ff_srt_muxer;
+extern const FFInputFormat  ff_srv3_demuxer;
 extern const FFInputFormat  ff_str_demuxer;
 extern const FFInputFormat  ff_stl_demuxer;
 extern const FFOutputFormat ff_streamhash_muxer;
diff --git a/libavformat/srv3.h b/libavformat/srv3.h
new file mode 100644
index 0000000000..45bf997654
--- /dev/null
+++ b/libavformat/srv3.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2024 Hubert Głuchowski
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFORMAT_SRV3_H
+#define AVFORMAT_SRV3_H
+
+#include "avformat.h"
+#include "internal.h"
+
+enum SRV3PenAttrs {
+    SRV3_PEN_ATTR_ITALIC = 1,
+    SRV3_PEN_ATTR_BOLD = 2,
+};
+
+// https://github.com/arcusmaximus/YTSubConverter/blob/38fb2ab469f37e8f3a5a6a27adf91d9d0e81ea4f/YTSubConverter.Shared/Formats/YttDocument.cs#L1019C14-L1019C14
+enum SRV3EdgeType {
+    SRV3_EDGE_NONE = 0,
+    SRV3_EDGE_HARD_SHADOW = 1,
+    SRV3_EDGE_BEVEL = 2,
+    SRV3_EDGE_GLOW = 3,
+    SRV3_EDGE_SOFT_SHADOW = 4,
+};
+
+enum SRV3RubyPart {
+    SRV3_RUBY_NONE = 0,
+    SRV3_RUBY_BASE = 1,
+    SRV3_RUBY_PARENTHESIS = 2,
+    SRV3_RUBY_BEFORE = 4,
+    SRV3_RUBY_AFTER = 5,
+};
+
+typedef struct SRV3Pen {
+    int id;
+
+    int font_size, font_style;
+    int attrs;
+
+    int edge_type, edge_color;
+
+    int ruby_part;
+
+    int foreground_color, foreground_alpha;
+    int background_color, background_alpha;
+
+    struct SRV3Pen *next;
+} SRV3Pen;
+
+typedef struct SRV3WindowPos {
+    int id;
+
+    int point, x, y;
+
+    struct SRV3WindowPos *next;
+} SRV3WindowPos;
+
+typedef struct SRV3Head {
+    SRV3Pen *pens;
+} SRV3Head;
+
+typedef struct SRV3Segment {
+    int size;
+    SRV3Pen *pen;
+
+    /*
+     * The next segment in the same event.
+     */
+    struct SRV3Segment *next;
+} SRV3Segment;
+
+typedef struct SRV3EventMeta {
+    /*
+    * An ordered list of segments.
+    */
+    SRV3Segment *segments;
+    SRV3WindowPos *wp;
+} SRV3EventMeta;
+
+#endif // AVFORMAT_SRV3_H
diff --git a/libavformat/srv3dec.c b/libavformat/srv3dec.c
new file mode 100644
index 0000000000..c8ccd5cac2
--- /dev/null
+++ b/libavformat/srv3dec.c
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2024 Hubert Głuchowski
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SRV3/YTT subtitle demuxer
+ * This is a youtube specific subtitle format that utilizes XML.
+ * Because there is currently no official documentation for the format,
+ * some information was acquired by reading YTSubConverter code.
+ * @see https://github.com/arcusmaximus/YTSubConverter
+ */
+
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include "srv3.h"
+#include "avformat.h"
+#include "demux.h"
+#include "internal.h"
+#include "subtitles.h"
+#include "libavutil/bprint.h"
+#include "libavutil/opt.h"
+#include "libavutil/mem.h"
+
+typedef struct SRV3GlobalSegments {
+    SRV3Segment *list;
+    struct SRV3GlobalSegments *next;
+} SRV3GlobalSegments;
+
+typedef struct SRV3Context {
+    const AVClass *class;
+    FFDemuxSubtitlesQueue q;
+    SRV3Pen *pens;
+    SRV3WindowPos *wps;
+    SRV3GlobalSegments *segments;
+} SRV3Context;
+
+static SRV3Pen srv3_default_pen = {
+    .id = -1,
+
+    .font_size = 100,
+    .font_style = 0,
+    .attrs = 0,
+
+    .edge_type = 0,
+    .edge_color = 0x020202,
+
+    .ruby_part = SRV3_RUBY_NONE,
+
+    .foreground_color = 0xFFFFFF,
+    .foreground_alpha = 254,
+    .background_color = 0x080808,
+    .background_alpha = 192,
+
+    .next = NULL
+};
+
+static void srv3_free_context_data(SRV3Context *ctx) {
+    void *next;
+
+#define FREE_LIST(type, list, until)                     \
+do {                                                                \
+    for (void *current = list; current && current != until; current = next) {  \
+        next = ((type*)current)->next;                              \
+        av_free(current);                                           \
+    }                                                               \
+} while(0)
+
+    FREE_LIST(SRV3Pen, ctx->pens, &srv3_default_pen);
+    FREE_LIST(SRV3WindowPos, ctx->wps, NULL);
+
+    for (SRV3GlobalSegments *segments = ctx->segments; segments; segments = next) {
+        FREE_LIST(SRV3Segment, segments->list, NULL);
+        next = segments->next;
+        av_free(segments);
+    }
+}
+
+static SRV3Pen *srv3_get_pen(SRV3Context *ctx, int id) {
+    for (SRV3Pen *pen = ctx->pens; pen; pen = pen->next)
+        if (pen->id == id)
+            return pen;
+    return NULL;
+}
+
+static int srv3_probe(const AVProbeData *p)
+{
+    if (strstr(p->buf, "<timedtext format=\"3\">"))
+        return AVPROBE_SCORE_MAX;
+
+    return 0;
+}
+
+static int srv3_parse_numeric_value(SRV3Context *ctx, const char *parent, const char *name, const char *value, int base, int *out, int min, int max)
+{
+    char *endptr;
+    long parsed;
+
+    parsed = strtol(value, &endptr, base);
+
+    if (endptr == value || *endptr != 0) { /* no digits consumed, or trailing garbage */
+        av_log(ctx, AV_LOG_WARNING, "Failed to parse value \"%s\" of %s attribute %s as an integer\n", value, parent, name);
+        return AVERROR_INVALIDDATA;
+    } else if (parsed < min || parsed > max) {
+        av_log(ctx, AV_LOG_WARNING, "Value %li out of range for %s attribute %s ([%i, %i])\n", parsed, parent, name, min, max);
+        return AVERROR(ERANGE);
+    } else if(out) {
+        *out = parsed;
+        return 0;
+    } else return parsed; /* no output pointer: the parsed value itself is returned */
+}
+
+static int srv3_parse_numeric_attr(SRV3Context *ctx, const char *parent, xmlAttrPtr attr, int *out, int min, int max)
+{ /* Returns 1 if *out was set, 0 otherwise; an attribute without a text child counts as a failure. */
+    return attr->children && srv3_parse_numeric_value(ctx, parent, attr->name, attr->children->content, 10, out, min, max) == 0;
+}
+
+static void srv3_parse_color_attr(SRV3Context *ctx, const char *parent, xmlAttrPtr attr, int *out) {
+    if (attr->children) /* children may be NULL when the attribute has no text node; *out is then left untouched */
+        srv3_parse_numeric_value(ctx, parent, attr->name, attr->children->content + (*attr->children->content == '#'), 16, out, 0, 0xFFFFFF);
+}
+
+static int srv3_read_pen(SRV3Context *ctx, xmlNodePtr element)
+{
+    SRV3Pen *pen = av_malloc(sizeof(*pen));
+    if (!pen)
+        return AVERROR(ENOMEM);
+    memcpy(pen, &srv3_default_pen, sizeof(*pen)); /* unspecified attributes keep the default pen's values */
+    pen->next = ctx->pens;
+    ctx->pens = pen;
+
+    for (xmlAttrPtr attr = element->properties; attr; attr = attr->next) {
+        if (!strcmp(attr->name, "id"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->id, 0, INT_MAX);
+        else if (!strcmp(attr->name, "sz"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->font_size, 0, INT_MAX);
+        else if (!strcmp(attr->name, "fs"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->font_style, 1, 7);
+        else if (!strcmp(attr->name, "et"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->edge_type, 1, 4);
+        else if (!strcmp(attr->name, "ec"))
+            srv3_parse_color_attr(ctx, "pen", attr, &pen->edge_color);
+        else if (!strcmp(attr->name, "fc"))
+            srv3_parse_color_attr(ctx, "pen", attr, &pen->foreground_color);
+        else if (!strcmp(attr->name, "fo"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->foreground_alpha, 0, 0xFF);
+        else if (!strcmp(attr->name, "bc"))
+            srv3_parse_color_attr(ctx, "pen", attr, &pen->background_color);
+        else if (!strcmp(attr->name, "bo"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->background_alpha, 0, 0xFF);
+        else if (!strcmp(attr->name, "rb")) {
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->ruby_part, 0, 5);
+            /*
+            * For whatever reason three seems to be an unused value for this enum.
+            */
+            if (pen->ruby_part == 3) {
+                pen->ruby_part = 0;
+                av_log(ctx, AV_LOG_WARNING, "Encountered unknown ruby part 3\n");
+            }
+        } else if (!strcmp(attr->name, "i")) /* children may be NULL when the attribute has no text node */
+            pen->attrs |= (attr->children && !strcmp(attr->children->content, "1")) * SRV3_PEN_ATTR_ITALIC;
+        else if (!strcmp(attr->name, "b"))
+            pen->attrs |= (attr->children && !strcmp(attr->children->content, "1")) * SRV3_PEN_ATTR_BOLD;
+        else {
+            av_log(ctx, AV_LOG_WARNING, "Unhandled pen property %s\n", attr->name);
+            continue;
+        }
+    }
+
+    return 0;
+}
+
+static int srv3_read_window_pos(SRV3Context *ctx, xmlNodePtr element)
+{
+    SRV3WindowPos *wp = av_mallocz(sizeof(*wp)); /* zeroed, so unspecified attributes read as 0 */
+    if (!wp)
+        return AVERROR(ENOMEM);
+    wp->next = ctx->wps;
+    ctx->wps = wp;
+
+    for (xmlAttrPtr attr = element->properties; attr; attr = attr->next) {
+        if (!strcmp(attr->name, "id"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->id, 0, INT_MAX);
+        else if (!strcmp(attr->name, "ap"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->point, 0, 8);
+        else if (!strcmp(attr->name, "ah"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->x, 0, 100);
+        else if (!strcmp(attr->name, "av"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->y, 0, 100);
+        else {
+            av_log(ctx, AV_LOG_WARNING, "Unhandled window pos property %s\n", attr->name);
+            continue;
+        }
+    }
+
+    return 0;
+}
+
+static int srv3_read_pens(SRV3Context *ctx, xmlNodePtr head)
+{
+    int ret;
+
+    for (xmlNodePtr element = head->children; element; element = element->next) {
+        if (!strcmp(element->name, "pen")) {
+            if ((ret = srv3_read_pen(ctx, element)) < 0)
+                return ret;
+        } else if (!strcmp(element->name, "wp")) {
+            if ((ret = srv3_read_window_pos(ctx, element)) < 0)
+                return ret;
+        }
+    }
+
+    return 0;
+}
+
+#define ZERO_WIDTH_SPACE "\xE2\x80\x8B" /* U+200B spelled as explicit UTF-8 bytes */
+#define YTSUBCONV_PADDING_SPACE ZERO_WIDTH_SPACE " " ZERO_WIDTH_SPACE
+
+/* Removes every zero-width space and YTSubConverter padding sequence from text in place.
+ * Returns the resulting length in bytes. */
+static int srv3_clean_segment_text(char *text) {
+    char *out = text, *start = text;
+
+    while (1) {
+        /* A padding sequence begins with a zero-width space, so one search finds both. */
+        char *zw = strstr(start, ZERO_WIDTH_SPACE);
+        int is_pad = zw && !strncmp(zw, YTSUBCONV_PADDING_SPACE, strlen(YTSUBCONV_PADDING_SPACE));
+        size_t cnt = zw ? (size_t)(zw - start) : strlen(start);
+
+        memmove(out, start, cnt);
+        out += cnt;
+        if (!zw)
+            break;
+        /* Skip the whole padding sequence if present, otherwise just the lone zero-width space. */
+        start = zw + strlen(is_pad ? YTSUBCONV_PADDING_SPACE : ZERO_WIDTH_SPACE);
+    }
+
+    *out = '\0';
+    return out - text;
+}
+
+static int srv3_read_body(SRV3Context *ctx, xmlNodePtr body)
+{
+    int ret = 0;
+    AVBPrint textbuf;
+    char *text;
+    AVPacket *sub;
+    SRV3WindowPos *wp;
+    SRV3EventMeta *event;
+    int start, duration;
+
+    av_bprint_init(&textbuf, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (xmlNodePtr element = body->children; element; element = element->next) {
+        if (!strcmp(element->name, "p")) {
+            SRV3Segment **segments_tail_next, *segments_tail = NULL;
+            SRV3GlobalSegments *global_segments;
+            int textlen, lastlen = 0;
+            SRV3Pen *event_pen = &srv3_default_pen;
+
+            if ((event = av_mallocz(sizeof(SRV3EventMeta))) == NULL) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            segments_tail_next = &event->segments;
+
+            for (xmlAttrPtr attr = element->properties; attr; attr = attr->next) {
+                if (!strcmp(attr->name, "t"))
+                    srv3_parse_numeric_attr(ctx, "event", attr, &start, 0, INT_MAX);
+                else if (!strcmp(attr->name, "d"))
+                    srv3_parse_numeric_attr(ctx, "event", attr, &duration, 0, INT_MAX);
+                else if (!strcmp(attr->name, "wp")) {
+                    int id;
+                    srv3_parse_numeric_attr(ctx, "event", attr, &id, 0, INT_MAX);
+                    for (wp = ctx->wps; wp; wp = wp->next)
+                        if (wp->id == id) {
+                            event->wp = wp;
+                            break;
+                        }
+                    if (!event->wp)
+                        av_log(ctx, AV_LOG_WARNING, "Non-existent window pos %i assigned to event\n", id);
+                } else if (!strcmp(attr->name, "p")) {
+                    int id;
+                    if(srv3_parse_numeric_attr(ctx, "event", attr, &id, 0, INT_MAX)) {
+                        SRV3Pen *pen = srv3_get_pen(ctx, id);
+                        if(pen)
+                            event_pen = pen;
+                        else
+                            av_log(ctx, AV_LOG_WARNING, "Non-existent pen %i assigned to event\n", id);
+                    }
+                } else if (!strcmp(attr->name, "ws")) {
+                    // TODO: Handle window styles
+                } else {
+                    av_log(ctx, AV_LOG_WARNING, "Unhandled event property %s\n", attr->name);
+                    continue;
+                }
+            }
+
+            for (xmlNodePtr node = element->children; node; node = node->next) {
+                SRV3Segment *segment;
+
+                if (node->type != XML_ELEMENT_NODE && node->type != XML_TEXT_NODE) {
+                    av_log(ctx, AV_LOG_WARNING, "Unexpected event child node type %i\n", node->type);
+                    continue;
+                } else if(node->type == XML_ELEMENT_NODE && strcmp(node->name, "s")) {
+                    av_log(ctx, AV_LOG_WARNING, "Unknown event child node name %s\n", node->name);
+                    continue;
+                } else if (node->type == XML_ELEMENT_NODE && !node->children)
+                    continue;
+
+                segment = av_mallocz(sizeof(SRV3Segment));
+                if (!segment) {
+                    ret = AVERROR(ENOMEM);
+                    goto end;
+                }
+
+                segment->pen = event_pen;
+
+                if (node->type == XML_ELEMENT_NODE)
+                    for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) {
+                        if (!strcmp(attr->name, "p")) {
+                            int id;
+                            if(srv3_parse_numeric_attr(ctx, "segment", attr, &id, 0, INT_MAX)) {
+                                SRV3Pen *pen = srv3_get_pen(ctx, id);
+                                if(pen)
+                                    segment->pen = pen;
+                                else
+                                    av_log(ctx, AV_LOG_WARNING, "Non-existent pen %i assigned to segment\n", id);
+                            }
+                        } else {
+                            av_log(ctx, AV_LOG_WARNING, "Unhandled segment property %s\n", attr->name);
+                            continue;
+                        }
+                    }
+
+                text = node->type == XML_ELEMENT_NODE ? node->children->content : node->content;
+                textlen = srv3_clean_segment_text(text);
+
+                if (textlen > 0) {
+                    for (int i = 0; i < textlen; ++i)
+                        if (text[i] != '\n' && text[i] != '\r')
+                            goto add_segment;
+
+                    av_bprint_append_data(&textbuf, text, textlen);
+
+                    // If possible append this segment's text to the previous segment
+                    // Otherwise leave it here for it to be prepended to the next segment
+                    if (segments_tail && (segments_tail->pen->font_size == segment->pen->font_size || segment->next == NULL)) {
+                        segments_tail->size += textlen;
+                        lastlen = textbuf.len;
+                    }
+                }
+
+                av_free(segment);
+                continue;
+
+add_segment:
+                av_bprint_append_data(&textbuf, text, textlen);
+
+                segment->size = textbuf.len - lastlen;
+                lastlen = textbuf.len;
+                *segments_tail_next = segment;
+                segments_tail_next = &segment->next;
+                segments_tail = segment;
+            }
+
+            if (!av_bprint_is_complete(&textbuf)) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            global_segments = av_mallocz(sizeof(SRV3GlobalSegments));
+            if (!global_segments) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            global_segments->list = event->segments;
+            global_segments->next = ctx->segments;
+            ctx->segments = global_segments;
+
+            sub = ff_subtitles_queue_insert(&ctx->q, textbuf.str, textbuf.len, 0);
+            if (!sub) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            sub->pts = start;
+            sub->duration = duration;
+
+            if ((ret = av_packet_add_side_data(sub, AV_PKT_DATA_SRV3_EVENT, (uint8_t*)event, sizeof(SRV3EventMeta))) < 0)
+               goto end;
+
+            av_bprint_clear(&textbuf);
+        }
+    }
+
+end:
+    av_bprint_finalize(&textbuf, NULL);
+    return ret;
+}
+
+/**
+ * Read a complete SRV3 document and queue its events as subtitle packets.
+ *
+ * Creates the subtitle stream, attaches an SRV3 head side-data entry to its
+ * codec parameters, reads the whole input into memory, parses it with libxml2
+ * and then handles every <head> element before any <body> element.
+ *
+ * @param s demuxer context; s->priv_data is used as the SRV3Context
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int srv3_read_header(AVFormatContext *s)
+{
+    int ret = 0;
+    SRV3Context *ctx = s->priv_data;
+    AVPacketSideData *head_sd;
+    SRV3Head *head;
+    AVBPrint content;
+    xmlDocPtr document = NULL;
+    xmlNodePtr root_element;
+    AVStream *st;
+
+    av_bprint_init(&content, 0, INT_MAX);
+
+    st = avformat_new_stream(s, NULL);
+    if (!st) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+    avpriv_set_pts_info(st, 64, 1, 1000);
+    st->codecpar->codec_type = AVMEDIA_TYPE_SUBTITLE;
+    st->codecpar->codec_id   = AV_CODEC_ID_SRV3;
+    st->disposition = AV_DISPOSITION_CAPTIONS;
+
+    head_sd = av_packet_side_data_new(&st->codecpar->coded_side_data,
+                                      &st->codecpar->nb_coded_side_data,
+                                      AV_PKT_DATA_SRV3_HEAD, sizeof(SRV3Head), 0);
+    if (!head_sd) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+    head = (SRV3Head*)head_sd->data;
+
+    if ((ret = avio_read_to_bprint(s->pb, &content, SIZE_MAX)) < 0)
+        goto end;
+    // A truncated buffer or leftover input means the document is not complete.
+    if (!avio_feof(s->pb) || !av_bprint_is_complete(&content)) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
+    LIBXML_TEST_VERSION;
+
+    document = xmlReadMemory(content.str, content.len, s->url, NULL, 0);
+
+    if (!document) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
+    root_element = xmlDocGetRootElement(document);
+    // xmlDocGetRootElement() may return NULL; do not dereference it blindly.
+    if (!root_element) {
+        av_log(s, AV_LOG_ERROR, "Document has no root element\n");
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
+    for (xmlAttrPtr attr = root_element->properties; attr; attr = attr->next) {
+        if (!strcmp((const char *)attr->name, "format")) {
+            // The attribute may have no value node (or no content); never pass
+            // a NULL pointer to strcmp() or to the "%s" conversion below.
+            const char *version = attr->children ? (const char *)attr->children->content : NULL;
+            if (!version || strcmp(version, "3"))
+                av_log(s, AV_LOG_WARNING, "Unrecognized timedtext format version: %s\nParsing will still be attempted but may produce unexpected results\n", version ? version : "(none)");
+        }
+    }
+
+    ctx->pens = &srv3_default_pen;
+
+    // First pass: <head> elements. Only element nodes are considered, since
+    // other node types are not guaranteed to carry a name.
+    for (xmlNodePtr element = root_element->children; element; element = element->next) {
+        if (element->type != XML_ELEMENT_NODE)
+            continue;
+        if (!strcmp((const char *)element->name, "head"))
+            if ((ret = srv3_read_pens(ctx, element)) < 0)
+                goto end;
+    }
+
+    // Second pass: <body> elements, handled after all <head> elements.
+    for (xmlNodePtr element = root_element->children; element; element = element->next) {
+        if (element->type != XML_ELEMENT_NODE)
+            continue;
+        if (!strcmp((const char *)element->name, "body"))
+            if ((ret = srv3_read_body(ctx, element)) < 0)
+                goto end;
+    }
+
+    head->pens = ctx->pens;
+    ff_subtitles_queue_finalize(s, &ctx->q);
+
+end:
+    // Reached on both success and failure: release the parsed document and
+    // the raw input buffer.
+    xmlFreeDoc(document);
+    av_bprint_finalize(&content, NULL);
+    return ret;
+}
+
+/**
+ * Demuxer read_packet callback: forwards to ff_subtitles_queue_read_packet()
+ * on the queue stored in the SRV3 private context and returns its result.
+ */
+static int srv3_read_packet(AVFormatContext *s, AVPacket *pkt)
+{
+    SRV3Context *srv3 = s->priv_data;
+    int status = ff_subtitles_queue_read_packet(&srv3->q, pkt);
+
+    return status;
+}
+
+/**
+ * Demuxer read_seek2 callback: passes all seek arguments unchanged to
+ * ff_subtitles_queue_seek() for the private context's queue and returns
+ * its result.
+ */
+static int srv3_read_seek(AVFormatContext *s, int stream_index,
+                            int64_t min_ts, int64_t ts, int64_t max_ts, int flags)
+{
+    SRV3Context *srv3 = s->priv_data;
+    int status;
+
+    status = ff_subtitles_queue_seek(&srv3->q, s, stream_index,
+                                     min_ts, ts, max_ts, flags);
+    return status;
+}
+
+/**
+ * Demuxer read_close callback: cleans the subtitle queue, then releases the
+ * remaining data held by the SRV3 private context. Always returns 0.
+ */
+static int srv3_read_close(AVFormatContext *s)
+{
+    SRV3Context *srv3 = s->priv_data;
+
+    ff_subtitles_queue_clean(&srv3->q);
+    srv3_free_context_data(srv3);
+
+    return 0;
+}
+
+/* Byte offset of field x inside the demuxer's private context. */
+#define OFFSET(x) offsetof(SRV3Context, x)
+/* Option flags for this demuxer's options; parenthesized so the expansion
+ * stays a single operand wherever it is used. */
+#define KIND_FLAGS (AV_OPT_FLAG_SUBTITLE_PARAM|AV_OPT_FLAG_DECODING_PARAM)
+
+/* The demuxer currently exposes no options; the table holds only the
+ * terminating entry. */
+static const AVOption options[] = {
+    { NULL }
+};
+
+/* AVClass for the demuxer's private context, referencing the options table. */
+static const AVClass srv3_demuxer_class = {
+    .version     = LIBAVUTIL_VERSION_INT,
+    .class_name  = "SRV3 demuxer",
+    .option      = options,
+};
+
+/* Demuxer definition: public description first, then private-context setup,
+ * then the callbacks. */
+const FFInputFormat ff_srv3_demuxer = {
+    /* Public AVInputFormat fields. */
+    .p.name         = "srv3",
+    .p.extensions   = "srv3",
+    .p.long_name    = NULL_IF_CONFIG_SMALL("SRV3 subtitle"),
+    .p.priv_class   = &srv3_demuxer_class,
+    /* Private context size and internal flags. */
+    .flags_internal = FF_INFMT_FLAG_INIT_CLEANUP,
+    .priv_data_size = sizeof(SRV3Context),
+    /* Callbacks. */
+    .read_probe     = srv3_probe,
+    .read_header    = srv3_read_header,
+    .read_packet    = srv3_read_packet,
+    .read_close     = srv3_read_close,
+    .read_seek2     = srv3_read_seek,
+};
-- 
2.47.0



More information about the ffmpeg-devel mailing list