[FFmpeg-devel] [PATCH 1/2] avformat/hls demuxer: Add WebVTT subtitle support

Fri Feb 21 11:18:02 EET 2025

softworkz:
> From: softworkz <softworkz at hotmail.com>
> 
> This adds support for WebVTT subtitles in HLS streams.
> Just like for separate audio streams, it supports all available
> WebVTT streams in all renditions.
> No new options are added, it just works and provides subtitle streams
> like any other demuxer.
> The code prevents downloading subtitle segments which are farther
> in the future than the main segments, to avoid loading hundreds
> of subtitle segments in advance.
> 
> Signed-off-by: softworkz <softworkz at hotmail.com>
> ---
>  libavformat/hls.c | 218 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 180 insertions(+), 38 deletions(-)
> 
> diff --git a/libavformat/hls.c b/libavformat/hls.c
> index 3bdc1bc848..51cf013a66 100644
> --- a/libavformat/hls.c
> +++ b/libavformat/hls.c
> @@ -56,6 +56,8 @@
>  #define MPEG_TIME_BASE 90000
>  #define MPEG_TIME_BASE_Q (AVRational){1, MPEG_TIME_BASE}
>  
> +static char *vtt_sample = "WEBVTT\n";

Missing const. And actually it is simpler to avoid this pointer
altogether and just use "WEBVTT\n" below.

> +
>  /*
>   * An apple http stream consists of a playlist with media segment files,
>   * played sequentially. There may be several playlists with the same
> @@ -173,6 +175,7 @@ struct playlist {
>       * playlist, if any. */
>      int n_init_sections;
>      struct segment **init_sections;
> +    int is_subtitle; /* Indicates if it's a subtitle playlist */
>  };
>  
>  /*
> @@ -330,6 +333,7 @@ static struct playlist *new_playlist(HLSContext *c, const char *url,
>          return NULL;
>      }
>      pls->seek_timestamp = AV_NOPTS_VALUE;
> +    pls->is_subtitle = 0;

Is pls not zero-allocated?

>  
>      pls->is_id3_timestamped = -1;
>      pls->id3_mpegts_timestamp = AV_NOPTS_VALUE;
> @@ -515,13 +519,6 @@ static struct rendition *new_rendition(HLSContext *c, struct rendition_info *inf
>          return NULL;
>      }
>  
> -    /* TODO: handle subtitles (each segment has to parsed separately) */
> -    if (c->ctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL)
> -        if (type == AVMEDIA_TYPE_SUBTITLE) {
> -            av_log(c->ctx, AV_LOG_WARNING, "Can't support the subtitle(uri: %s)\n", info->uri);
> -            return NULL;
> -        }
> -
>      rend = av_mallocz(sizeof(struct rendition));
>      if (!rend)
>          return NULL;
> @@ -536,9 +533,14 @@ static struct rendition *new_rendition(HLSContext *c, struct rendition_info *inf
>      /* add the playlist if this is an external rendition */
>      if (info->uri[0]) {
>          rend->playlist = new_playlist(c, info->uri, url_base);
> -        if (rend->playlist)
> +        if (rend->playlist) {
>              dynarray_add(&rend->playlist->renditions,
> -                         &rend->playlist->n_renditions, rend);
> +                        &rend->playlist->n_renditions, rend);

Don't change indentation in this patch.

> +            if (type == AVMEDIA_TYPE_SUBTITLE) {
> +                rend->playlist->is_subtitle = 1;
> +                rend->playlist->is_id3_timestamped = 0;
> +            }

Nit: Put this block before dynarray_add().

> +        }
>      }
>  
>      if (info->assoc_language[0]) {
> @@ -761,6 +763,9 @@ static int test_segment(AVFormatContext *s, const AVInputFormat *in_fmt, struct
>          } else if (!strcmp(in_fmt->name, "mpegts")) {
>              matchF =      av_match_ext(    seg->url, "ts,m2t,m2ts,mts,mpg,m4s,mpeg,mpegts")
>                       + 2*(ff_match_url_ext(seg->url, "ts,m2t,m2ts,mts,mpg,m4s,mpeg,mpegts") > 0);
> +        } else if (!strcmp(in_fmt->name, "webvtt")) {
> +            matchF =      av_match_ext(    seg->url, "vtt,webvtt")
> +                     + 2*(ff_match_url_ext(seg->url, "vtt,webvtt") > 0);
>          }
>  
>          if (!(matchA & matchF)) {
> @@ -969,7 +974,7 @@ static int parse_playlist(HLSContext *c, const char *url,
>              if (ptr)
>                  seg_offset = strtoll(ptr+1, NULL, 10);
>          } else if (av_strstart(line, "#", NULL)) {
> -            av_log(c->ctx, AV_LOG_INFO, "Skip ('%s')\n", line);
> +            av_log(c->ctx, AV_LOG_VERBOSE, "Skip ('%s')\n", line);
>              continue;
>          } else if (line[0]) {
>              if (is_variant) {
> @@ -1484,7 +1489,7 @@ static int playlist_needed(struct playlist *pls)
>      int first_st;
>  
>      /* If there is no context or streams yet, the playlist is needed */
> -    if (!pls->ctx || !pls->n_main_streams)
> +    if ((!pls->ctx || !pls->n_main_streams) && !pls->is_subtitle)
>          return 1;
>  
>      /* check if any of the streams in the playlist are needed */
> @@ -1522,17 +1527,13 @@ static int playlist_needed(struct playlist *pls)
>      return 0;
>  }
>  
> -static int read_data(void *opaque, uint8_t *buf, int buf_size)
> +static int reload_playlist(struct playlist *v, HLSContext *c)
>  {
> -    struct playlist *v = opaque;
> -    HLSContext *c = v->parent->priv_data;
> -    int ret;
> -    int just_opened = 0;
> +    int ret = 0;
>      int reload_count = 0;
> -    int segment_retries = 0;
> -    struct segment *seg;
>  
> -restart:
> +    v->needed = playlist_needed(v);
> +
>      if (!v->needed)
>          return AVERROR_EOF;
>  
> @@ -1588,7 +1589,7 @@ reload:
>              av_log(v->parent, AV_LOG_WARNING, "The m3u8 list sequence may have been wrapped.\n");
>          }
>          if (v->cur_seq_no >= v->start_seq_no + v->n_segments) {
> -            if (v->finished)
> +            if (v->finished || v->is_subtitle)
>                  return AVERROR_EOF;
>              while (av_gettime_relative() - v->last_load_time < reload_interval) {
>                  if (ff_check_interrupt(c->interrupt_callback))
> @@ -1599,9 +1600,35 @@ reload:
>              goto reload;
>          }
>  
> -        v->input_read_done = 0;
> -        seg = current_segment(v);
> +    }
> +    return ret;
> +}
> +
> +static int read_data_continuous(void *opaque, uint8_t *buf, int buf_size)
> +{
> +    struct playlist *v = opaque;
> +    HLSContext *c = v->parent->priv_data;
> +    int ret;
> +    int just_opened = 0;
> +    int segment_retries = 0;
> +    struct segment *seg;
> +
> +    if (c->http_persistent && v->input_read_done) {
> +        ret = reload_playlist(v, c);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    v->input_read_done = 0;
> +
> +restart:
> +    ret = reload_playlist(v, c);
> +    if (ret < 0)
> +        return ret;
>  
> +    seg = current_segment(v);
> +
> +    if (!v->input || (c->http_persistent && v->input_read_done)) {
>          /* load/update Media Initialization Section, if any */
>          ret = update_init_section(v, seg);
>          if (ret)
> @@ -1630,7 +1657,7 @@ reload:
>              } else {
>                  segment_retries++;
>              }
> -            goto reload;
> +            goto restart;
>          }
>          segment_retries = 0;
>          just_opened = 1;
> @@ -1692,6 +1719,110 @@ reload:
>      goto restart;
>  }
>  
> +static int read_data_subtitle_segment(void *opaque, uint8_t *buf, int buf_size)
> +{
> +    struct playlist *v = opaque;
> +    HLSContext *c = v->parent->priv_data;
> +    int ret;
> +    struct segment *seg;
> +
> +    if (!v->needed || v->cur_seq_no - v->start_seq_no >= v->n_segments) {
> +        return AVERROR_EOF;
> +    } else {
> +        seg = current_segment(v);
> +    }
> +
> +    if (!v->input) {
> +        ret = open_input(c, v, seg, &v->input);
> +        if (ret < 0) {
> +            if (ff_check_interrupt(c->interrupt_callback))
> +                return AVERROR_EXIT;
> +            av_log(v->parent, AV_LOG_WARNING, "Failed to open segment of playlist %d\n",
> +                   v->index);
> +            return ret;
> +        }
> +    }
> +
> +    return read_from_url(v, seg, buf, buf_size);
> +}
> +
> +static int nested_io_open(AVFormatContext *s, AVIOContext **pb, const char *url,
> +                          int flags, AVDictionary **opts)
> +{
> +    av_log(s, AV_LOG_ERROR,
> +           "A HLS playlist item '%s' referred to an external file '%s'. "
> +           "Opening this file was forbidden for security reasons\n",
> +           s->url, url);
> +    return AVERROR(EPERM);
> +}
> +
> +static int init_subtitle_context(struct playlist *pls)
> +{
> +    HLSContext *c = pls->parent->priv_data;
> +    const AVInputFormat *in_fmt;
> +    AVDictionary *opts = NULL;
> +    int ret;
> +
> +    if (!(pls->ctx = avformat_alloc_context()))
> +        return AVERROR(ENOMEM);
> +
> +    pls->read_buffer = av_malloc(INITIAL_BUFFER_SIZE);
> +    if (!pls->read_buffer) {
> +        avformat_free_context(pls->ctx);
> +        pls->ctx = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    ffio_init_context(&pls->pb, pls->read_buffer, INITIAL_BUFFER_SIZE, 0, pls,
> +                      read_data_subtitle_segment, NULL, NULL);
> +    pls->pb.pub.seekable = 0;
> +    pls->ctx->pb       = &pls->pb.pub;
> +    pls->ctx->io_open  = nested_io_open;
> +
> +    ret = ff_copy_whiteblacklists(pls->ctx, pls->parent);
> +    if (ret < 0)
> +        return ret;
> +
> +    in_fmt = av_find_input_format("webvtt");
> +    av_dict_copy(&opts, c->seg_format_opts, 0);
> +    ret = avformat_open_input(&pls->ctx, current_segment(pls)->url, in_fmt, &opts);
> +    av_dict_free(&opts);
> +
> +    return ret;
> +}
> +
> +static int read_subtitle_packet(struct playlist *v, AVPacket *pkt)
> +{
> +    HLSContext *c = v->parent->priv_data;
> +    int ret;
> +
> +restart:
> +    ret = reload_playlist(v, c);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (v->input && !v->ctx)
> +        ff_format_io_close(v->parent, &v->input);
> +
> +    if (!v->input && !v->ctx) {
> +        ret = init_subtitle_context(v);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    ret = av_read_frame(v->ctx, v->pkt);
> +    if (!ret) {
> +        return ret;
> +    }
> +    ff_format_io_close(v->parent, &v->input);
> +    v->cur_seq_no++;
> +    c->cur_seq_no = v->cur_seq_no;
> +
> +    avformat_close_input(&v->ctx);
> +
> +    goto restart;
> +}
> +
>  static void add_renditions_to_variant(HLSContext *c, struct variant *var,
>                                        enum AVMediaType type, const char *group_id)
>  {
> @@ -1853,16 +1984,6 @@ static int64_t select_cur_seq_no(HLSContext *c, struct playlist *pls)
>      return pls->start_seq_no;
>  }
>  
> -static int nested_io_open(AVFormatContext *s, AVIOContext **pb, const char *url,
> -                          int flags, AVDictionary **opts)
> -{
> -    av_log(s, AV_LOG_ERROR,
> -           "A HLS playlist item '%s' referred to an external file '%s'. "
> -           "Opening this file was forbidden for security reasons\n",
> -           s->url, url);
> -    return AVERROR(EPERM);
> -}
> -
>  static void add_stream_to_programs(AVFormatContext *s, struct playlist *pls, AVStream *stream)
>  {
>      HLSContext *c = s->priv_data;
> @@ -2070,6 +2191,8 @@ static int hls_read_header(AVFormatContext *s)
>          highest_cur_seq_no = FFMAX(highest_cur_seq_no, pls->cur_seq_no);
>      }
>  
> +    av_dict_set(&c->seg_format_opts, "prefer_hls_mpegts_pts", "1", 0);
> +
>      /* Open the demuxer for each playlist */
>      for (i = 0; i < c->n_playlists; i++) {
>          struct playlist *pls = c->playlists[i];
> @@ -2107,8 +2230,12 @@ static int hls_read_header(AVFormatContext *s)
>              return AVERROR(ENOMEM);
>          }
>  
> -        ffio_init_context(&pls->pb, pls->read_buffer, INITIAL_BUFFER_SIZE, 0, pls,
> -                          read_data, NULL, NULL);
> +        if (pls->is_subtitle)
> +            ffio_init_context(&pls->pb, (unsigned char*)av_strdup(vtt_sample), (int)strlen(vtt_sample), 0, pls,
> +                                        NULL, NULL, NULL);
> +        else
> +            ffio_init_context(&pls->pb, pls->read_buffer, INITIAL_BUFFER_SIZE, 0, pls,
> +                                        read_data_continuous, NULL, NULL);

1. Unchecked av_strdup().
2. Is duplicating the string even needed? Can't we simply set the
AVIOContext to NULL before closing the AVFormatContext?

>  
>          /*
>           * If encryption scheme is SAMPLE-AES, try to read  ID3 tags of
> @@ -2254,6 +2381,13 @@ static int hls_read_header(AVFormatContext *s)
>          if (pls->n_main_streams)
>              av_dict_copy(&pls->main_streams[0]->metadata, pls->ctx->metadata, 0);
>  
> +        if (pls->is_subtitle) {
> +            avformat_free_context(pls->ctx);

Doesn't the copy of vtt_sample leak here?

> +            pls->ctx = NULL;
> +            pls->needed = 0;
> +            pls->main_streams[0]->discard = AVDISCARD_ALL;
> +        }
> +
>          add_metadata_from_renditions(s, pls, AVMEDIA_TYPE_AUDIO);
>          add_metadata_from_renditions(s, pls, AVMEDIA_TYPE_VIDEO);
>          add_metadata_from_renditions(s, pls, AVMEDIA_TYPE_SUBTITLE);
> @@ -2296,6 +2430,8 @@ static int recheck_discard_flags(AVFormatContext *s, int first)
>              pls->input_read_done = 0;
>              ff_format_io_close(pls->parent, &pls->input_next);
>              pls->input_next_requested = 0;
> +            if (pls->is_subtitle)
> +                avformat_close_input(&pls->ctx);
>              pls->needed = 0;
>              changed = 1;
>              av_log(s, AV_LOG_INFO, "No longer receiving playlist %d\n", i);
> @@ -2363,7 +2499,10 @@ static int hls_read_packet(AVFormatContext *s, AVPacket *pkt)
>                  int64_t ts_diff;
>                  AVRational tb;
>                  struct segment *seg = NULL;
> -                ret = av_read_frame(pls->ctx, pls->pkt);
> +                if (pls->is_subtitle)
> +                    ret = read_subtitle_packet(pls, pls->pkt);
> +                else
> +                    ret = av_read_frame(pls->ctx, pls->pkt);
>                  if (ret < 0) {
>                      if (!avio_feof(&pls->pb.pub) && ret != AVERROR_EOF)
>                          return ret;
> @@ -2559,7 +2698,10 @@ static int hls_read_seek(AVFormatContext *s, int stream_index,
>          /* Reset the pos, to let the mpegts/mov demuxer know we've seeked. */
>          pb->pos = 0;
>          /* Flush the packet queue of the subdemuxer. */
> -        ff_read_frame_flush(pls->ctx);
> +        if (pls->ctx)
> +            ff_read_frame_flush(pls->ctx);
> +        if (pls->is_subtitle)
> +            avformat_close_input(&pls->ctx);
>  
>          /* Reset the init segment so it's re-fetched and served appropiately */
>          pls->cur_init_section = NULL;
> @@ -2628,7 +2770,7 @@ static const AVOption hls_options[] = {
>          OFFSET(prefer_x_start), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS},
>      {"allowed_extensions", "List of file extensions that hls is allowed to access",
>          OFFSET(allowed_extensions), AV_OPT_TYPE_STRING,
> -        {.str = "3gp,aac,avi,ac3,eac3,flac,mkv,m3u8,m4a,m4s,m4v,mpg,mov,mp2,mp3,mp4,mpeg,mpegts,ogg,ogv,oga,ts,vob,wav"},
> +        {.str = "3gp,aac,avi,ac3,eac3,flac,mkv,m3u8,m4a,m4s,m4v,mpg,mov,mp2,mp3,mp4,mpeg,mpegts,ogg,ogv,oga,ts,vob,vtt,wav,webvtt"},
>          INT_MIN, INT_MAX, FLAGS},
>      {"extension_picky", "Be picky with all extensions matching",
>          OFFSET(extension_picky), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS},