[FFmpeg-devel] [PATCH 3/3] lavf/movenc: support iTunes cover art

Sun Jul 28 17:28:30 CEST 2013

On Sat, Jul 27, 2013 at 09:37:25PM +0200, Matthieu Bouron wrote:
> 
> BTW, updated patch attached (fixing an indentation mistake + return
> immediately after storing attached pic in mov_write_packet function).

Updated and rebased patch attached (take psp check into account + fix a
warning).

[...]
-------------- next part --------------
>From 5180daf02567e91f47190e3ef33a5b792db68f62 Mon Sep 17 00:00:00 2001
From: Matthieu Bouron <matthieu.bouron at gmail.com>
Date: Thu, 27 Jun 2013 18:12:50 +0200
Subject: [PATCH] lavf/movenc: support iTunes cover art

Video streams with AV_DISPOSITION_ATTACHED_PIC will be used as cover art
and won't be muxed as normal tracks in the resulting file.
---
 libavformat/movenc.c     | 200 +++++++++++++++++++++++++++++++++++++++++------
 libavformat/movenc.h     |  10 ++-
 libavformat/movenchint.c |  14 +++-
 3 files changed, 195 insertions(+), 29 deletions(-)

diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index 42ec3f2..08bdbc5 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -104,6 +104,15 @@ static int is_co64_required(const MOVTrack *track)
     return 0;
 }
 
+/* Return 1 if st carries AV_DISPOSITION_ATTACHED_PIC and the mode is neither 3GP nor MOV, else 0. */
+static int mov_stream_is_apic(MOVMuxContext *mov, AVStream *st)
+{
+    if (mov->mode & (MODE_3GP | MODE_MOV))
+        return 0;
+
+    return !!(st->disposition & AV_DISPOSITION_ATTACHED_PIC);
+}
+
 /* Chunk offset atom */
 static int mov_write_stco_tag(AVIOContext *pb, MOVTrack *track)
 {
@@ -2064,6 +2073,46 @@ static int mov_write_int8_metadata(AVFormatContext *s, AVIOContext *pb,
     return size;
 }
 
+/* Write a 'covr' atom with one 'data' entry per MJPEG/PNG/BMP packet in covers; returns update_size()'s result. */
+static int mov_write_covr(AVFormatContext *s, AVIOContext *pb, AVPacketList *covers)
+{
+    int64_t pos = avio_tell(pb);
+
+    avio_wb32(pb, 0); /* placeholder for the atom size, see update_size() below */
+    ffio_wfourcc(pb, "covr");
+
+    for (; covers; covers = covers->next) {
+        int type;
+        AVPacket *pkt = &covers->pkt;
+        AVCodecContext *enc = s->streams[pkt->stream_index]->codec;
+
+        /* pick the value written as the 'data' entry type for this image codec */
+        switch (enc->codec_id) {
+        case AV_CODEC_ID_MJPEG:
+            type = 0xD;
+            break;
+        case AV_CODEC_ID_PNG:
+            type = 0xE;
+            break;
+        case AV_CODEC_ID_BMP:
+            type = 0x1B;
+            break;
+        default:
+            av_log(s, AV_LOG_ERROR, "unsupported codec %s for cover, skipping\n",
+                   enc->codec_name);
+            continue;
+        }
+
+        avio_wb32(pb, 16 + pkt->size); /* 16 = size + tag + type + 4 zero bytes */
+        ffio_wfourcc(pb, "data");
+        avio_wb32(pb, type);
+        avio_wb32(pb, 0);
+        avio_write(pb, pkt->data, pkt->size);
+    }
+
+    return update_size(pb, pos);
+}
+
 /* iTunes meta data list */
 static int mov_write_ilst_tag(AVIOContext *pb, MOVMuxContext *mov,
                               AVFormatContext *s)
@@ -2093,6 +2142,10 @@ static int mov_write_ilst_tag(AVIOContext *pb, MOVMuxContext *mov,
     mov_write_int8_metadata  (s, pb, "stik",    "media_type",1);
     mov_write_int8_metadata  (s, pb, "hdvd",    "hd_video",  1);
     mov_write_int8_metadata  (s, pb, "pgap",    "gapless_playback",1);
+
+    if (mov->covers)
+        mov_write_covr(s, pb, mov->covers);
+
     mov_write_trkn_tag(pb, mov, s);
     mov_write_tmpo_tag(pb, s);
     return update_size(pb, pos);
@@ -2197,7 +2250,7 @@ static int mov_write_udta_tag(AVIOContext *pb, MOVMuxContext *mov,
     int i, ret, size;
     uint8_t *buf;
 
-    for (i = 0; i < s->nb_streams; i++)
+    for (i = 0; i < mov->nb_input_streams; i++)
         if (mov->tracks[i].enc->flags & CODEC_FLAG_BITEXACT) {
             return 0;
         }
@@ -2340,7 +2393,7 @@ static int mov_write_moov_tag(AVIOContext *pb, MOVMuxContext *mov,
     }
 
     if (mov->chapter_track)
-        for (i = 0; i < s->nb_streams; i++) {
+        for (i = 0; i < mov->nb_input_streams; i++) {
             mov->tracks[i].tref_tag = MKTAG('c','h','a','p');
             mov->tracks[i].tref_id  = mov->tracks[mov->chapter_track].track_id;
         }
@@ -2365,14 +2418,14 @@ static int mov_write_moov_tag(AVIOContext *pb, MOVMuxContext *mov,
         mov_write_iods_tag(pb, mov);
     for (i = 0; i < mov->nb_streams; i++) {
         if (mov->tracks[i].entry > 0 || mov->flags & FF_MOV_FLAG_FRAGMENT) {
-            if (i < s->nb_streams){
+            if (i < mov->nb_input_streams) {
                 int codec_type= s->streams[i]->codec->codec_type;
                 if (codec_type==AVMEDIA_TYPE_AUDIO || codec_type==AVMEDIA_TYPE_SUBTITLE){
                     mov->tracks[i].secondary= not_first[codec_type];
                     not_first[codec_type]= 1;
                 }
             }
-            mov_write_trak_tag(pb, mov, &(mov->tracks[i]), i < s->nb_streams ? s->streams[i] : NULL);
+            mov_write_trak_tag(pb, mov, &(mov->tracks[i]), i < mov->nb_input_streams ? s->streams[i] : NULL);
         }
     }
     if (mov->flags & FF_MOV_FLAG_FRAGMENT)
@@ -2800,7 +2853,8 @@ static int mov_write_ftyp_tag(AVIOContext *pb, AVFormatContext *s)
 
     for (i = 0; i < s->nb_streams; i++) {
         AVStream *st = s->streams[i];
-        if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO)
+        if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
+            !mov_stream_is_apic(mov, st))
             has_video = 1;
         if (st->codec->codec_id == AV_CODEC_ID_H264)
             has_h264 = 1;
@@ -3007,6 +3061,18 @@ static int get_moov_size(AVFormatContext *s)
     return ret;
 }
 
+/* Return the index in mov->tracks of the first track whose stream_index matches, or -1 if none does. */
+int ff_mov_get_track_index(MOVMuxContext *mov, int stream_index)
+{
+    int idx = 0;
+
+    while (idx < mov->nb_streams &&
+           mov->tracks[idx].stream_index != stream_index)
+        idx++;
+
+    return idx < mov->nb_streams ? idx : -1;
+}
+
 static int mov_flush_fragment(AVFormatContext *s)
 {
     MOVMuxContext *mov = s->priv_data;
@@ -3136,12 +3202,19 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
 {
     MOVMuxContext *mov = s->priv_data;
     AVIOContext *pb = s->pb;
-    MOVTrack *trk = &mov->tracks[pkt->stream_index];
-    AVCodecContext *enc = trk->enc;
+    int trk_index = ff_mov_get_track_index(mov, pkt->stream_index);
+    MOVTrack *trk;
+    AVCodecContext *enc;
     unsigned int samples_in_chunk = 0;
     int size = pkt->size;
     uint8_t *reformatted_data = NULL;
 
+    if (trk_index < 0)
+        return 0;
+
+    trk = &mov->tracks[trk_index];
+    enc = trk->enc;
+
     if (mov->flags & FF_MOV_FLAG_FRAGMENT) {
         int ret;
         if (mov->fragments > 0) {
@@ -3289,14 +3362,21 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
 static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
 {
         MOVMuxContext *mov = s->priv_data;
-        MOVTrack *trk = &mov->tracks[pkt->stream_index];
-        AVCodecContext *enc = trk->enc;
+        int trk_index = ff_mov_get_track_index(mov, pkt->stream_index);
+        MOVTrack *trk;
+        AVCodecContext *enc;
         int64_t frag_duration = 0;
         int size = pkt->size;
 
         if (!pkt->size)
             return 0;             /* Discard 0 sized packets */
 
+        if (trk_index < 0)
+            return 0;
+
+        trk = &mov->tracks[trk_index];
+        enc = trk->enc;
+
         if (trk->entry && pkt->stream_index < s->nb_streams)
             frag_duration = av_rescale_q(pkt->dts - trk->cluster[0].dts,
                                          s->streams[pkt->stream_index]->time_base,
@@ -3343,6 +3423,7 @@ static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
     } else {
         int i;
         MOVMuxContext *mov = s->priv_data;
+        AVStream *st = s->streams[pkt->stream_index];
 
         if (!pkt->size) return 0; /* Discard 0 sized packets */
 
@@ -3378,24 +3459,56 @@ static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
             }
         }
 
+        if (mov_stream_is_apic(mov, st)) {
+
+            /* warn only once */
+            if (st->nb_frames == 1) {
+                av_log(s, AV_LOG_WARNING, "Got more than one picture in stream %d,"
+                    " ignoring.\n", pkt->stream_index);
+            }
+
+            if (st->nb_frames == 0) {
+                int ret;
+                AVPacketList *last, *covers = av_mallocz(sizeof(AVPacketList));
+                if (!covers)
+                    return AVERROR(ENOMEM);
+
+                if ((ret = av_copy_packet(&covers->pkt, pkt)) < 0) {
+                    av_freep(&covers);
+                    return ret;
+                }
+
+                if (!mov->covers)
+                    mov->covers = covers;
+                else {
+                    last = mov->covers;
+                    while (last->next)
+                        last = last->next;
+                    last->next = covers;
+                }
+            }
+            return 0;
+        }
+
         return mov_write_single_packet(s, pkt);
     }
 }
 
 // QuickTime chapters involve an additional text track with the chapter names
 // as samples, and a tref pointing from the other tracks to the chapter one.
-static void mov_create_chapter_track(AVFormatContext *s, int tracknum)
+static void mov_create_chapter_track(AVFormatContext *s, int tracknum, int stream_index)
 {
     AVIOContext *pb;
 
     MOVMuxContext *mov = s->priv_data;
     MOVTrack *track = &mov->tracks[tracknum];
-    AVPacket pkt = { .stream_index = tracknum, .flags = AV_PKT_FLAG_KEY };
+    AVPacket pkt = { .stream_index = stream_index, .flags = AV_PKT_FLAG_KEY };
     int i, len;
 
     track->mode = mov->mode;
     track->tag = MKTAG('t','e','x','t');
     track->timescale = MOV_TIMESCALE;
+    track->stream_index = stream_index;
     track->enc = avcodec_alloc_context3(NULL);
     track->enc->codec_type = AVMEDIA_TYPE_SUBTITLE;
 
@@ -3463,14 +3576,14 @@ static void mov_create_chapter_track(AVFormatContext *s, int tracknum)
     }
 }
 
-static int mov_create_timecode_track(AVFormatContext *s, int index, int src_index, const char *tcstr)
+static int mov_create_timecode_track(AVFormatContext *s, int index, int stream_index, int src_index, const char *tcstr)
 {
     int ret;
     MOVMuxContext *mov  = s->priv_data;
     MOVTrack *track     = &mov->tracks[index];
     AVStream *src_st    = s->streams[src_index];
     AVTimecode tc;
-    AVPacket pkt    = {.stream_index = index, .flags = AV_PKT_FLAG_KEY, .size = 4};
+    AVPacket pkt    = {.stream_index = stream_index, .flags = AV_PKT_FLAG_KEY, .size = 4};
     AVRational rate = {src_st->codec->time_base.den, src_st->codec->time_base.num};
 
     /* if the codec time base makes no sense, try to fallback on stream frame rate */
@@ -3489,6 +3602,7 @@ static int mov_create_timecode_track(AVFormatContext *s, int index, int src_inde
     track->mode      = mov->mode;
     track->tag       = MKTAG('t','m','c','d');
     track->src_track = src_index;
+    track->stream_index = stream_index;
     track->timescale = mov->tracks[src_index].timescale;
     if (tc.flags & AV_TIMECODE_FLAG_DROPFRAME)
         track->timecode_flags |= MOV_TIMECODE_FLAG_DROPFRAME;
@@ -3512,7 +3626,8 @@ static int mov_write_header(AVFormatContext *s)
     AVIOContext *pb = s->pb;
     MOVMuxContext *mov = s->priv_data;
     AVDictionaryEntry *t, *global_tcr = av_dict_get(s->metadata, "timecode", NULL, 0);
-    int i, hint_track = 0, tmcd_track = 0;
+    int i, stream_index = -1, hint_track = 0, hint_stream_index = 0;
+    int tmcd_track = 0, tmcd_stream_index = 0;
 
     /* Set the FRAGMENT flag if any of the fragmentation methods are
      * enabled. */
@@ -3560,7 +3675,8 @@ static int mov_write_header(AVFormatContext *s)
         int video_streams_nb = 0, audio_streams_nb = 0, other_streams_nb = 0;
         for (i = 0; i < s->nb_streams; i++) {
             AVStream *st = s->streams[i];
-            if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO)
+            if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
+                !mov_stream_is_apic(mov, st))
                 video_streams_nb++;
             else if (st->codec->codec_type == AVMEDIA_TYPE_AUDIO)
                 audio_streams_nb++;
@@ -3575,29 +3691,45 @@ static int mov_write_header(AVFormatContext *s)
         mov_write_uuidprof_tag(pb, s);
     }
 
-    mov->nb_streams = s->nb_streams;
-    if (mov->mode & (MODE_MOV|MODE_IPOD) && s->nb_chapters)
+    mov->next_stream_index = mov->nb_streams = s->nb_streams;
+
+    for (i = 0; i < s->nb_streams; i++) {
+        AVStream *st = s->streams[i];
+        if (mov_stream_is_apic(mov, st))
+            mov->nb_streams--;
+    }
+
+    mov->nb_input_streams = mov->nb_streams;
+
+    if (mov->mode & (MODE_MOV|MODE_IPOD) && s->nb_chapters) {
         mov->chapter_track = mov->nb_streams++;
+        mov->chapter_stream_index = mov->next_stream_index++;
+    }
 
     if (mov->flags & FF_MOV_FLAG_RTP_HINT) {
         /* Add hint tracks for each audio and video stream */
         hint_track = mov->nb_streams;
+        hint_stream_index = mov->next_stream_index++;
         for (i = 0; i < s->nb_streams; i++) {
             AVStream *st = s->streams[i];
-            if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO ||
+            if ((st->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
+                 !mov_stream_is_apic(mov, st)) ||
                 st->codec->codec_type == AVMEDIA_TYPE_AUDIO) {
                 mov->nb_streams++;
+                mov->next_stream_index++;
             }
         }
     }
 
     if (mov->mode == MODE_MOV) {
         tmcd_track = mov->nb_streams;
+        tmcd_stream_index = mov->next_stream_index;
 
         /* +1 tmcd track for each video stream with a timecode */
         for (i = 0; i < s->nb_streams; i++) {
             AVStream *st = s->streams[i];
-            if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
+            if ((st->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
+                !mov_stream_is_apic(mov, st)) &&
                 (global_tcr || av_dict_get(st->metadata, "timecode", NULL, 0)))
                 mov->nb_meta_tmcd++;
         }
@@ -3615,6 +3747,7 @@ static int mov_write_header(AVFormatContext *s)
         }
 
         mov->nb_streams += mov->nb_meta_tmcd;
+        mov->next_stream_index += mov->nb_meta_tmcd;
     }
 
     mov->tracks = av_mallocz(mov->nb_streams * sizeof(*mov->tracks));
@@ -3623,9 +3756,17 @@ static int mov_write_header(AVFormatContext *s)
 
     for (i = 0; i < s->nb_streams; i++) {
         AVStream *st= s->streams[i];
-        MOVTrack *track= &mov->tracks[i];
+        MOVTrack *track;
         AVDictionaryEntry *lang = av_dict_get(st->metadata, "language", NULL,0);
 
+        /* skip cover art streams */
+        if (mov_stream_is_apic(mov, st))
+            continue;
+
+        stream_index++;
+        track = &mov->tracks[stream_index];
+        track->stream_index = i;
+
         track->enc = st->codec;
         track->language = ff_mov_iso639_to_lang(lang?lang->value:"und", mov->mode!=MODE_MOV);
         if (track->language < 0)
@@ -3734,17 +3875,19 @@ static int mov_write_header(AVFormatContext *s)
         mov->time += 0x7C25B080; // 1970 based -> 1904 based
 
     if (mov->chapter_track)
-        mov_create_chapter_track(s, mov->chapter_track);
+        mov_create_chapter_track(s, mov->chapter_track, mov->chapter_stream_index);
 
     if (mov->flags & FF_MOV_FLAG_RTP_HINT) {
         /* Initialize the hint tracks for each audio and video stream */
         for (i = 0; i < s->nb_streams; i++) {
             AVStream *st = s->streams[i];
-            if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO ||
+            if ((st->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
+                 !mov_stream_is_apic(mov, st)) ||
                 st->codec->codec_type == AVMEDIA_TYPE_AUDIO) {
-                if (ff_mov_init_hinting(s, hint_track, i) < 0)
+                if (ff_mov_init_hinting(s, hint_track, hint_stream_index, i) < 0)
                     goto error;
                 hint_track++;
+                hint_stream_index++;
             }
         }
     }
@@ -3760,9 +3903,10 @@ static int mov_write_header(AVFormatContext *s)
                     t = av_dict_get(st->metadata, "timecode", NULL, 0);
                 if (!t)
                     continue;
-                if (mov_create_timecode_track(s, tmcd_track, i, t->value) < 0)
+                if (mov_create_timecode_track(s, tmcd_track, tmcd_stream_index, i, t->value) < 0)
                     goto error;
                 tmcd_track++;
+                tmcd_stream_index++;
             }
         }
     }
@@ -3881,6 +4025,7 @@ static int mov_write_trailer(AVFormatContext *s)
 {
     MOVMuxContext *mov = s->priv_data;
     AVIOContext *pb = s->pb;
+    AVPacketList *covers = mov->covers;
     int64_t moov_pos;
     int res = 0;
     int i;
@@ -3971,6 +4116,13 @@ static int mov_write_trailer(AVFormatContext *s)
 
     av_freep(&mov->tracks);
 
+    while (covers) {
+        AVPacketList *next = covers->next;
+        av_free_packet(&covers->pkt);
+        av_freep(&covers);
+        covers = next;
+    }
+
     return res;
 }
 
diff --git a/libavformat/movenc.h b/libavformat/movenc.h
index a6571d5..56709fd 100644
--- a/libavformat/movenc.h
+++ b/libavformat/movenc.h
@@ -141,6 +141,8 @@ typedef struct MOVTrack {
         int     packet_entry;
         int     slices;
     } vc1_info;
+
+    int stream_index;
 } MOVTrack;
 
 typedef struct MOVMuxContext {
@@ -172,6 +174,11 @@ typedef struct MOVMuxContext {
 
     int use_editlist;
     int video_track_timescale;
+
+    AVPacketList *covers;
+    int nb_input_streams; ///< number of input stream used
+    int next_stream_index; ///< next stream index available
+    int chapter_stream_index; ///< chapter stream index
 } MOVMuxContext;
 
 #define FF_MOV_FLAG_RTP_HINT 1
@@ -185,10 +192,11 @@ typedef struct MOVMuxContext {
 
 int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt);
 
-int ff_mov_init_hinting(AVFormatContext *s, int index, int src_index);
+int ff_mov_init_hinting(AVFormatContext *s, int index, int stream_index, int src_index);
 int ff_mov_add_hinted_packet(AVFormatContext *s, AVPacket *pkt,
                              int track_index, int sample,
                              uint8_t *sample_data, int sample_size);
 void ff_mov_close_hinting(MOVTrack *track);
+int ff_mov_get_track_index(MOVMuxContext *mov, int stream_index);
 
 #endif /* AVFORMAT_MOVENC_H */
diff --git a/libavformat/movenchint.c b/libavformat/movenchint.c
index 943680e..bb5fd38 100644
--- a/libavformat/movenchint.c
+++ b/libavformat/movenchint.c
@@ -20,22 +20,28 @@
  */
 
 #include "movenc.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "internal.h"
 #include "rtpenc_chain.h"
 #include "avio_internal.h"
 #include "rtp.h"
 
-int ff_mov_init_hinting(AVFormatContext *s, int index, int src_index)
+int ff_mov_init_hinting(AVFormatContext *s, int index, int stream_index, int src_index)
 {
     MOVMuxContext *mov  = s->priv_data;
     MOVTrack *track     = &mov->tracks[index];
-    MOVTrack *src_track = &mov->tracks[src_index];
     AVStream *src_st    = s->streams[src_index];
+    int src_track_index = ff_mov_get_track_index(mov, src_index);
+    MOVTrack *src_track;
     int ret = AVERROR(ENOMEM);
 
+    av_assert0(src_track_index >= 0);
+    src_track = &mov->tracks[src_track_index];
+
     track->tag = MKTAG('r','t','p',' ');
-    track->src_track = src_index;
+    track->src_track = src_track_index;
+    track->stream_index = stream_index;
 
     track->enc = avcodec_alloc_context3(NULL);
     if (!track->enc)
@@ -445,7 +451,7 @@ int ff_mov_add_hinted_packet(AVFormatContext *s, AVPacket *pkt,
     hint_pkt.size = size = avio_close_dyn_buf(hintbuf, &buf);
     hint_pkt.data = buf;
     hint_pkt.pts  = hint_pkt.dts;
-    hint_pkt.stream_index = track_index;
+    hint_pkt.stream_index = trk->stream_index;
     if (pkt->flags & AV_PKT_FLAG_KEY)
         hint_pkt.flags |= AV_PKT_FLAG_KEY;
     if (count > 0)
-- 
1.7.12.4 (Apple Git-37)