[FFmpeg-devel] [PATCH] lavf: JSON captions demuxer.

Tue Nov 20 21:43:18 CET 2012

On Tue, Nov 20, 2012 at 08:40:43PM +0100, Nicolas George wrote:
> TODO version bump.
> 
> Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
> ---
>  Changelog                     |    1 +
>  doc/demuxers.texi             |   21 +++
>  libavformat/Makefile          |    1 +
>  libavformat/allformats.c      |    1 +
>  libavformat/jsoncaptionsdec.c |  350 +++++++++++++++++++++++++++++++++++++++++

Please add an entry in general.texi.

>  5 files changed, 374 insertions(+)
>  create mode 100644 libavformat/jsoncaptionsdec.c
> 
> diff --git a/Changelog b/Changelog
> index 783c4c3..a460cfb 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -26,6 +26,7 @@ version <next>:
>  - remove ffserver daemon mode
>  - AST demuxer
>  - new expansion syntax for drawtext
> +- JSON captions (used in TED talks) decoding support
>  
>  
>  version 1.0:
> diff --git a/doc/demuxers.texi b/doc/demuxers.texi
> index aea4c54..7b752c1 100644
> --- a/doc/demuxers.texi
> +++ b/doc/demuxers.texi
> @@ -184,4 +184,25 @@ the script is directly played, the actual times will match the absolute
>  timestamps up to the sound controller's clock accuracy, but if the user
>  somehow pauses the playback or seeks, all times will be shifted accordingly.
>  
> +@section jsoncaptions
> +
> +JSON captions used for @url{http://www.ted.com/, TED Talks}.
> +
> +TED does not provide links to the captions, but they can be guessed from the
> +page. The following bookmarklet can find them:
> +@url{javascript:(function()%7bd%3Dwindow.open%28%22%22%2C%22sub%22%2C%22width%3D256%2Cheight%3D512%2Cresizable%3Dyes%2Cscrollbars%3Dyes%22%29.document%3B%20l%3Ddocument.getElementById%28%22languageCode%22%29.getElementsByTagName%28%22option%22%29%3B%20for%28i%3D1%3Bi%3Cl.length%3Bi++%29%7B%20d.body.appendChild%28p%3Dd.createElement%28%22p%22%29%29%3B%20p.appendChild%28a%3Dd.createElement%28%22a%22%29%29%3B%20a.appendChild%28d.createTextNode%28l%5Bi%5D.textContent%29%29%3B%20a.href%3D%22http%3A//www.ted.com/talks/subtitles/id/%22%20+%20talkID+%22/lang/%22+l%5Bi%5D.value%3B%20%7D%20%7d)();void%200, TED Talks captions}.

Erk…

Can't you point to a library like libquvi (if it supports this) or
something similar for such a task? Or maybe just add a script in tools/.

> +
> +This demuxer accepts the following option:
> +@table @option
> +@item start_time
> +Set the start time of the TED talk, in milliseconds. The default is 15s.

"15000 (15s)"

> +It is used to sync the captions with the downloadable videos, because they
> +include a 15s intro.
> +@end table
> +
> +Example: convert the captions to a format most players understand:
> +@example
> +ffmpeg -i http://www.ted.com/talks/subtitles/id/1/lang/en talk1-en.srt

Since it's an example, maybe use a less common id than 1, such as 685?

> +@end example
> +
>  @c man end INPUT DEVICES
> diff --git a/libavformat/Makefile b/libavformat/Makefile
> index 136ada8..6537307 100644
> --- a/libavformat/Makefile
> +++ b/libavformat/Makefile
> @@ -149,6 +149,7 @@ OBJS-$(CONFIG_IVF_DEMUXER)               += ivfdec.o
>  OBJS-$(CONFIG_IVF_MUXER)                 += ivfenc.o
>  OBJS-$(CONFIG_JACOSUB_DEMUXER)           += jacosubdec.o
>  OBJS-$(CONFIG_JACOSUB_MUXER)             += jacosubenc.o rawenc.o
> +OBJS-$(CONFIG_JSONCAPTIONS_DEMUXER)      += jsoncaptionsdec.o
>  OBJS-$(CONFIG_JV_DEMUXER)                += jvdec.o
>  OBJS-$(CONFIG_LATM_DEMUXER)              += rawdec.o
>  OBJS-$(CONFIG_LATM_MUXER)                += latmenc.o rawenc.o
> diff --git a/libavformat/allformats.c b/libavformat/allformats.c
> index d08c134..c292ada 100644
> --- a/libavformat/allformats.c
> +++ b/libavformat/allformats.c
> @@ -131,6 +131,7 @@ void av_register_all(void)
>      REGISTER_DEMUXER  (IV8, iv8);
>      REGISTER_MUXDEMUX (IVF, ivf);
>      REGISTER_MUXDEMUX (JACOSUB, jacosub);
> +    REGISTER_DEMUXER  (JSONCAPTIONS, jsoncaptions);
>      REGISTER_DEMUXER  (JV, jv);
>      REGISTER_MUXDEMUX (LATM, latm);
>      REGISTER_DEMUXER  (LMLM4, lmlm4);
> diff --git a/libavformat/jsoncaptionsdec.c b/libavformat/jsoncaptionsdec.c
> new file mode 100644
> index 0000000..a6b496b
> --- /dev/null
> +++ b/libavformat/jsoncaptionsdec.c
> @@ -0,0 +1,350 @@
> +/*
> + * JSON captions format decoder
> + * Copyright (c) 2012 Nicolas George
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/bprint.h"
> +#include "libavutil/log.h"
> +#include "libavutil/opt.h"
> +#include "avformat.h"
> +#include "internal.h"
> +#include "subtitles.h"
> +
> +typedef struct {
> +    AVClass *class;
> +    int64_t start_time;
> +    FFDemuxSubtitlesQueue subs;
> +} JSONCaptionsDemuxer;
> +
> +static const AVOption json_captions_options[] = {
> +    { "start_time", "set the start time (offset) of the subtitles, in ms",
> +      offsetof(JSONCaptionsDemuxer, start_time), FF_OPT_TYPE_INT64,
> +      { .i64 = 15000 }, INT64_MIN, INT64_MAX,
> +      AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM },
> +    { NULL },
> +};
> +
> +static const AVClass json_captions_demuxer_class = {
> +    .class_name = "json_captions_demuxer",
> +    .item_name  = av_default_item_name,
> +    .option     = json_captions_options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> +#define HEX_DIGIT_TEST(c) ( (unsigned)(c)       - '0' <= 9 || \
> +                           ((unsigned)(c) | 32) - 'a' <= 5)
> +#define HEX_DIGIT_VAL(c) ((c) <= '9' ? (c) - '0' : ((c) | 32) - 'a' + 10)
> +#define ERR_CODE(c) (c < 0 ? c : AVERROR_INVALIDDATA)
> +
> +static void av_bprint_utf8(AVBPrint *bp, unsigned c)
> +{
> +    int bytes, i;
> +
> +    if (c <= 0x7F) {
> +        av_bprint_chars(bp, c, 1);
> +        return;
> +    }
> +    bytes = (av_log2(c) - 2) / 5;
> +    av_bprint_chars(bp, (c >> (bytes * 6)) | ((0xFF80 >> bytes) & 0xFF), 1);
> +    for (i = bytes - 1; i >= 0; i--)
> +        av_bprint_chars(bp, ((c >> (i * 6)) & 0x3F) | 0x80, 1);
> +}
> +
> +static void next_byte(AVIOContext *pb, int *cur_byte)
> +{
> +    uint8_t b;
> +    int ret = avio_read(pb, &b, 1);
> +    *cur_byte = ret > 0 ? b : ret == 0 ? AVERROR_EOF : ret;
> +}
> +

Note: I wonder if this kind of code couldn't be shared with some other
subtitles demuxers, such as WebVTT.

> +static void skip_spaces(AVIOContext *pb, int *cur_byte)
> +{
> +    while (*cur_byte == ' '  || *cur_byte == '\t' ||
> +           *cur_byte == '\n' || *cur_byte == '\r')
> +        next_byte(pb, cur_byte);
> +}
> +
> +static int expect_byte(AVIOContext *pb, int *cur_byte, uint8_t c)
> +{
> +    skip_spaces(pb, cur_byte);
> +    if (*cur_byte != c)
> +        return ERR_CODE(*cur_byte);
> +    next_byte(pb, cur_byte);
> +    return 0;
> +}
> +
> +static int parse_string(AVIOContext *pb, int *cur_byte, AVBPrint *bp, int full)
> +{
> +    int ret;
> +
> +    av_bprint_init(bp, 0, full ? -1 : 1);
> +    ret = expect_byte(pb, cur_byte, '"');
> +    if (ret < 0)
> +        goto fail;
> +    while (*cur_byte > 0 && *cur_byte != '"') {
> +        if (*cur_byte == '\\') {
> +            next_byte(pb, cur_byte);
> +            if (*cur_byte < 0) {
> +                ret = AVERROR_INVALIDDATA;
> +                goto fail;
> +            }
> +            if ((*cur_byte | 32) == 'u') {
> +                unsigned chr = 0, i;
> +                for (i = 0; i < 4; i++) {
> +                    next_byte(pb, cur_byte);
> +                    if (!HEX_DIGIT_TEST(*cur_byte)) {
> +                        ret = ERR_CODE(*cur_byte);
> +                        goto fail;
> +                    }
> +                    chr = chr * 16 + HEX_DIGIT_VAL(*cur_byte);
> +                }
> +                av_bprint_utf8(bp, chr);
> +            } else {
> +                av_bprint_chars(bp, *cur_byte, 1);
> +            }
> +        } else {
> +            av_bprint_chars(bp, *cur_byte, 1);
> +        }
> +        next_byte(pb, cur_byte);
> +    }
> +    ret = expect_byte(pb, cur_byte, '"');
> +    if (ret < 0)
> +        goto fail;
> +    if (full && !av_bprint_is_complete(bp)) {
> +        ret = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +    return 0;
> +
> +fail:
> +    av_bprint_finalize(bp, NULL);
> +    return ret;
> +}
> +
> +static int parse_label(AVIOContext *pb, int *cur_byte, AVBPrint *bp)
> +{
> +    int ret;
> +
> +    ret = parse_string(pb, cur_byte, bp, 0);
> +    if (ret < 0)
> +        return ret;
> +    ret = expect_byte(pb, cur_byte, ':');
> +    if (ret < 0)
> +        return ret;
> +    return 0;
> +}
> +
> +static int parse_boolean(AVIOContext *pb, int *cur_byte, int *result)
> +{
> +    const char *text[] = { "false", "true" }, *p;
> +    int i;
> +
> +    skip_spaces(pb, cur_byte);
> +    for (i = 0; i < 2; i++) {
> +        p = text[i];
> +        if (*cur_byte != *p)
> +            continue;
> +        for (; *p; p++, next_byte(pb, cur_byte))
> +            if (*cur_byte != *p)
> +                return AVERROR_INVALIDDATA;
> +        if ((((unsigned)*cur_byte) | 32) - 'a' <= 'z' - 'a')

I may be missing something obvious, but is the -'a' necessary?

> +            return AVERROR_INVALIDDATA;
> +        *result = i;
> +        return 0;
> +    }
> +    return AVERROR_INVALIDDATA;
> +}
> +
> +static int parse_int(AVIOContext *pb, int *cur_byte, int64_t *result)
> +{
> +    int64_t val = 0;
> +
> +    skip_spaces(pb, cur_byte);
> +    if ((unsigned)*cur_byte - '0' > 9)
> +        return AVERROR_INVALIDDATA;
> +    while ((unsigned)*cur_byte - '0' <= 9) {
> +        val = val * 10 + (*cur_byte - '0');
> +        next_byte(pb, cur_byte);
> +    }
> +    *result = val;
> +    return 0;
> +}
> +
> +static int parse_file(AVIOContext *pb, FFDemuxSubtitlesQueue *subs)
> +{
> +    int ret, cur_byte, start_of_par;
> +    AVBPrint label, content;
> +    int64_t pos, start, duration;
> +    AVPacket *pkt;
> +
> +    next_byte(pb, &cur_byte);
> +    ret = expect_byte(pb, &cur_byte, '{');
> +    if (ret < 0)
> +        return AVERROR_INVALIDDATA;
> +    ret = parse_label(pb, &cur_byte, &label);
> +    if (ret < 0 || strcmp(label.str, "captions"))
> +        return AVERROR_INVALIDDATA;
> +    ret = expect_byte(pb, &cur_byte, '[');
> +    if (ret < 0)
> +        return AVERROR_INVALIDDATA;
> +    while (1) {
> +        content.size = 0;
> +        start = duration = AV_NOPTS_VALUE;
> +        ret = expect_byte(pb, &cur_byte, '{');
> +        if (ret < 0)
> +            return ret;
> +        pos = avio_tell(pb) - 1;
> +        while (1) {
> +            ret = parse_label(pb, &cur_byte, &label);
> +            if (ret < 0)
> +                return ret;
> +            if (!strcmp(label.str, "startOfParagraph")) {
> +                ret = parse_boolean(pb, &cur_byte, &start_of_par);
> +                if (ret < 0)
> +                    return ret;
> +            } else if (!strcmp(label.str, "content")) {
> +                ret = parse_string(pb, &cur_byte, &content, 1);
> +                if (ret < 0)
> +                    return ret;
> +            } else if (!strcmp(label.str, "startTime")) {
> +                ret = parse_int(pb, &cur_byte, &start);
> +                if (ret < 0)
> +                    return ret;
> +            } else if (!strcmp(label.str, "duration")) {
> +                ret = parse_int(pb, &cur_byte, &duration);
> +                if (ret < 0)
> +                    return ret;
> +            } else {
> +                return AVERROR_INVALIDDATA;
> +            }
> +            skip_spaces(pb, &cur_byte);
> +            if (cur_byte != ',')
> +                break;
> +            next_byte(pb, &cur_byte);
> +        }
> +        ret = expect_byte(pb, &cur_byte, '}');
> +        if (ret < 0)
> +            return ret;
> +
> +        if (!content.size || start == AV_NOPTS_VALUE ||
> +            duration == AV_NOPTS_VALUE)
> +            return AVERROR_INVALIDDATA;
> +        pkt = ff_subtitles_queue_insert(subs, content.str, content.len, 0);
> +        if (!pkt)
> +            return AVERROR(ENOMEM);
> +        pkt->pos      = pos;
> +        pkt->dts      =
> +        pkt->pts      = start;
> +        pkt->duration = duration;
> +        av_bprint_finalize(&content, NULL);
> +
> +        skip_spaces(pb, &cur_byte);
> +        if (cur_byte != ',')
> +            break;
> +        next_byte(pb, &cur_byte);
> +    }
> +    ret = expect_byte(pb, &cur_byte, ']');
> +    if (ret < 0)
> +        return ret;
> +    ret = expect_byte(pb, &cur_byte, '}');
> +    if (ret < 0)
> +        return ret;
> +    skip_spaces(pb, &cur_byte);
> +    if (cur_byte != AVERROR_EOF)
> +        return ERR_CODE(cur_byte);
> +    return 0;
> +}
> +
> +static av_cold int json_captions_read_header(AVFormatContext *avf)
> +{
> +    JSONCaptionsDemuxer *jc = avf->priv_data;
> +    AVStream *st;
> +    int ret, i;
> +    AVPacket *last;
> +
> +    ret = parse_file(avf->pb, &jc->subs);
> +    if (ret < 0) {
> +        if (ret == AVERROR_INVALIDDATA)
> +            av_log(avf, AV_LOG_ERROR, "Syntax error near offset %"PRId64".\n",
> +                   avio_tell(avf->pb));
> +        ff_subtitles_queue_clean(&jc->subs);
> +        return ret;
> +    }
> +    ff_subtitles_queue_finalize(&jc->subs);
> +    for (i = 0; i < jc->subs.nb_subs; i++)
> +        jc->subs.subs[i].pts += jc->start_time;
> +
> +    last = &jc->subs.subs[jc->subs.nb_subs - 1];
> +    st = avformat_new_stream(avf, NULL);
> +    if (!st)
> +        return AVERROR(ENOMEM);
> +    st->codec->codec_type     = AVMEDIA_TYPE_SUBTITLE;
> +    st->codec->codec_id       = CODEC_ID_TEXT;

Hopefully there are no TED talks with SubRip markup :)

> +    avpriv_set_pts_info(st, 64, 1, 1000);
> +    st->probe_packets = 0;
> +    st->start_time    = 0;
> +    st->duration      = last->pts + last->duration;
> +    st->cur_dts       = 0;
> +
> +    return 0;
> +

nit: trailing \n

> +}
> +
> +static int json_captions_read_packet(AVFormatContext *avf, AVPacket *packet)
> +{
> +    JSONCaptionsDemuxer *jc = avf->priv_data;
> +
> +    return ff_subtitles_queue_read_packet(&jc->subs, packet);
> +}
> +
> +static int json_captions_read_close(AVFormatContext *avf)
> +{
> +    JSONCaptionsDemuxer *jc = avf->priv_data;
> +
> +    ff_subtitles_queue_clean(&jc->subs);
> +    return 0;
> +}
> +
> +static av_cold int json_captions_read_probe(AVProbeData *p)
> +{
> +    FFDemuxSubtitlesQueue subs = { 0 };
> +    AVIOContext *input = avio_alloc_context(p->buf, p->buf_size, 0,
> +                                            NULL, NULL, NULL, NULL);
> +    int ret;
> +
> +    if (!input)
> +        return 0;
> +    ret = parse_file(input, &subs);

Too bad we have to parse the whole file twice. I'm a bit concerned by
the time this probing function will take; it's likely it will slow down
auto detection of other formats. I'd suggest just browsing the probe data
and checking if the stream makes sense. You could do this when the second
parse_file parameter is NULL, for instance.

> +    ret = !ret || avio_tell(input) == p->buf_size ?
> +          subs.nb_subs > 1 ? AVPROBE_SCORE_MAX : AVPROBE_SCORE_MAX / 2 : 0;

please make this a bit more readable

> +    ff_subtitles_queue_clean(&subs);
> +    av_free(input);
> +    return ret;
> +}
> +
> +AVInputFormat ff_jsoncaptions_demuxer = {
> +    .name           = "json_captions",
> +    .long_name      = NULL_IF_CONFIG_SMALL("JSON captions (used in TED talks)"),
> +    .priv_data_size = sizeof(JSONCaptionsDemuxer),
> +    .read_header    = json_captions_read_header,
> +    .read_packet    = json_captions_read_packet,
> +    .read_close     = json_captions_read_close,
> +    .read_probe     = json_captions_read_probe,
> +    .priv_class     = &json_captions_demuxer_class,
> +};

The rest looks OK

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20121120/e79d4698/attachment.asc>