[FFmpeg-devel] [PATCH] avformat: Implement subtitle charenc guessing

Rodger Combs rodger.combs at gmail.com
Fri Dec 12 07:05:27 CET 2014


This also moves general charenc conversion from avcodec to avformat;
the version in avcodec is left, but renamed; I'm not sure if that's
the optimal solution.

The documentation could probably use some improvements, and a few more
options could be added to ENCA.

This very simply prefers libguess over ENCA, and ENCA over uchardet, but
will fall back on a less-preferred guess if something decodes wrong, and will
drop illegal sequences in iconv if all else fails.

It'd be possible to have ffmpeg.c present a UI if multiple guesses are
returned, and other library consumers could do the same.
---
 configure                   |  15 +++
 libavcodec/options_table.h  |   2 +-
 libavformat/aqtitledec.c    |   2 +
 libavformat/assdec.c        |   2 +
 libavformat/avformat.h      |  50 +++++++++
 libavformat/jacosubdec.c    |   2 +
 libavformat/microdvddec.c   |   2 +
 libavformat/mpl2dec.c       |   2 +
 libavformat/mpsubdec.c      |   2 +
 libavformat/options_table.h |   7 ++
 libavformat/pjsdec.c        |   2 +
 libavformat/realtextdec.c   |   2 +
 libavformat/samidec.c       |   2 +
 libavformat/srtdec.c        |   2 +
 libavformat/stldec.c        |   2 +
 libavformat/subtitles.c     | 262 +++++++++++++++++++++++++++++++++++++++++++-
 libavformat/subtitles.h     |   1 +
 libavformat/subviewer1dec.c |   2 +
 libavformat/subviewerdec.c  |   2 +
 libavformat/utils.c         |   2 +
 libavformat/vplayerdec.c    |   2 +
 libavformat/webvttdec.c     |   2 +
 22 files changed, 365 insertions(+), 4 deletions(-)

diff --git a/configure b/configure
index e2e3619..a5a9f9b 100755
--- a/configure
+++ b/configure
@@ -199,6 +199,9 @@ External library support:
   --enable-gnutls          enable gnutls, needed for https support
                            if openssl is not used [no]
   --disable-iconv          disable iconv [autodetect]
+  --disable-libguess       disable libguess [autodetect]
+  --disable-uchardet       disable universalchardet [autodetect]
+  --enable-enca            enable enca [no]
   --enable-ladspa          enable LADSPA audio filtering [no]
   --enable-libaacplus      enable AAC+ encoding via libaacplus [no]
   --enable-libass          enable libass subtitles rendering,
@@ -1342,6 +1345,9 @@ EXTERNAL_LIBRARY_LIST="
     frei0r
     gnutls
     iconv
+    libguess
+    uchardet
+    enca
     ladspa
     libaacplus
     libass
@@ -4358,6 +4364,7 @@ die_license_disabled gpl libxavs
 die_license_disabled gpl libxvid
 die_license_disabled gpl libzvbi
 die_license_disabled gpl x11grab
+die_license_disabled gpl enca
 
 die_license_disabled nonfree libaacplus
 die_license_disabled nonfree libfaac
@@ -5117,6 +5124,14 @@ enabled vdpau && enabled xlib &&
 # Funny iconv installations are not unusual, so check it after all flags have been set
 disabled iconv || check_func_headers iconv.h iconv || check_lib2 iconv.h iconv -liconv || disable iconv
 
+disabled iconv || disabled libguess || disable libguess && {
+    check_pkg_config libguess libguess.h libguess_determine_encoding && require_pkg_config libguess libguess.h libguess_determine_encoding && enable libguess;
+}
+disabled iconv || disabled uchardet || disable uchardet && {
+    check_pkg_config uchardet uchardet.h uchardet_new && require_pkg_config uchardet uchardet.h uchardet_new && enable uchardet;
+}
+enabled enca && { check_func_headers enca.h enca_analyse || check_lib2 enca.h enca_analyse -lenca || die "ERROR: enca not found"; }
+
 enabled debug && add_cflags -g"$debuglevel" && add_asflags -g"$debuglevel"
 
 # add some useful compiler flags if supported
diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
index 1d5b078..93b3105 100644
--- a/libavcodec/options_table.h
+++ b/libavcodec/options_table.h
@@ -472,7 +472,7 @@ static const AVOption avcodec_options[] = {
 {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, INT_MAX, A|D, "request_sample_fmt"},
 {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
-{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
+{"sub_charenc_lavc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
 {"sub_charenc_mode", "set input text subtitles character encoding mode", OFFSET(sub_charenc_mode), AV_OPT_TYPE_FLAGS, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, -1, INT_MAX, S|D, "sub_charenc_mode"},
 {"do_nothing",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_DO_NOTHING},  INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
 {"auto",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC},   INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
diff --git a/libavformat/aqtitledec.c b/libavformat/aqtitledec.c
index 9508766..65aa3e4 100644
--- a/libavformat/aqtitledec.c
+++ b/libavformat/aqtitledec.c
@@ -55,6 +55,8 @@ static int aqt_read_header(AVFormatContext *s)
     int64_t pos = 0, frame = AV_NOPTS_VALUE;
     AVPacket *sub = NULL;
 
+    aqt->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, aqt->frame_rate.den, aqt->frame_rate.num);
diff --git a/libavformat/assdec.c b/libavformat/assdec.c
index c62e76f..958792b 100644
--- a/libavformat/assdec.c
+++ b/libavformat/assdec.c
@@ -114,6 +114,8 @@ static int ass_read_header(AVFormatContext *s)
     FFTextReader tr;
     ff_text_init_avio(s, &tr, s->pb);
 
+    ass->q.avctx = s;
+
     st = avformat_new_stream(s, NULL);
     if (!st)
         return AVERROR(ENOMEM);
diff --git a/libavformat/avformat.h b/libavformat/avformat.h
index 2e54ed1..8c5fa7e 100644
--- a/libavformat/avformat.h
+++ b/libavformat/avformat.h
@@ -1755,6 +1755,56 @@ typedef struct AVFormatContext {
      * - demuxing: Set by user.
      */
     uint8_t *dump_separator;
+
+    /**
+     * Character encoding of a subtitle file ("auto" requests detection)
+     * - encoding: unused
+     * - decoding: Set by user via AVOptions; may be changed after initialization
+     */
+    char *sub_charenc;
+
+    /**
+     * Array of guesses for the character encoding
+     * - encoding: unused
+     * - decoding: Set by demuxer
+     */
+    int nb_sub_charenc_guesses;
+    char **sub_charenc_guesses;
+
+    /**
+     * Language to pass to libguess for charenc detection.
+     * - encoding: unused
+     * - decoding: Set by user via AVOptions (NO direct access)
+     */
+    char *libguess_language;
+
+    /**
+     * Language to pass to libenca for charenc detection.
+     * - encoding: unused
+     * - decoding: Set by user via AVOptions (NO direct access)
+     */
+    char *enca_language;
+
+    /**
+     * Threshold parameter for libenca charenc detection.
+     * - encoding: unused
+     * - decoding: Set by user via AVOptions (NO direct access)
+     */
+    double enca_threshold;
+
+    /**
+     * Whether or not to check for multibyte charsets in libenca.
+     * - encoding: unused
+     * - decoding: Set by user via AVOptions (NO direct access)
+     */
+    int enca_multibyte;
+
+    /**
+     * Whether or not to let libenca return an ambiguous result.
+     * - encoding: unused
+     * - decoding: Set by user via AVOptions (NO direct access)
+     */
+    int enca_ambiguity;
 } AVFormatContext;
 
 int av_format_get_probe_score(const AVFormatContext *s);
diff --git a/libavformat/jacosubdec.c b/libavformat/jacosubdec.c
index 1ca0055..fa332fa 100644
--- a/libavformat/jacosubdec.c
+++ b/libavformat/jacosubdec.c
@@ -170,6 +170,8 @@ static int jacosub_read_header(AVFormatContext *s)
     st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;
     st->codec->codec_id   = AV_CODEC_ID_JACOSUB;
 
+    jacosub->q.avctx = s;
+
     jacosub->timeres = 30;
 
     av_bprint_init(&header, 1024+FF_INPUT_BUFFER_PADDING_SIZE, 4096);
diff --git a/libavformat/microdvddec.c b/libavformat/microdvddec.c
index ce3433c..5c3b48c 100644
--- a/libavformat/microdvddec.c
+++ b/libavformat/microdvddec.c
@@ -85,6 +85,8 @@ static int microdvd_read_header(AVFormatContext *s)
     char line_buf[MAX_LINESIZE];
     int has_real_fps = 0;
 
+    microdvd->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
 
diff --git a/libavformat/mpl2dec.c b/libavformat/mpl2dec.c
index 260b7be..fa431c3 100644
--- a/libavformat/mpl2dec.c
+++ b/libavformat/mpl2dec.c
@@ -77,6 +77,8 @@ static int mpl2_read_header(AVFormatContext *s)
     AVStream *st = avformat_new_stream(s, NULL);
     int res = 0;
 
+    mpl2->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 10);
diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c
index eddc594..7bb08f9 100644
--- a/libavformat/mpsubdec.c
+++ b/libavformat/mpsubdec.c
@@ -61,6 +61,8 @@ static int mpsub_read_header(AVFormatContext *s)
     float multiplier = 100.0;
     float current_pts = 0;
 
+    mpsub->q.avctx = s;
+
     av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
 
     while (!avio_feof(s->pb)) {
diff --git a/libavformat/options_table.h b/libavformat/options_table.h
index 40f1e0a..741bfb2 100644
--- a/libavformat/options_table.h
+++ b/libavformat/options_table.h
@@ -22,6 +22,7 @@
 #define AVFORMAT_OPTIONS_TABLE_H
 
 #include <limits.h>
+#include <float.h>  /* DBL_MAX */
 
 #include "libavutil/opt.h"
 #include "avformat.h"
@@ -99,6 +100,12 @@ static const AVOption avformat_options[] = {
 {"dump_separator", "set information dump field separator", OFFSET(dump_separator), AV_OPT_TYPE_STRING, {.str = ", "}, CHAR_MIN, CHAR_MAX, D|E},
 {"codec_whitelist", "List of decoders that are allowed to be used", OFFSET(codec_whitelist), AV_OPT_TYPE_STRING, { .str = NULL },  CHAR_MIN, CHAR_MAX, D },
 {"format_whitelist", "List of demuxers that are allowed to be used", OFFSET(format_whitelist), AV_OPT_TYPE_STRING, { .str = NULL },  CHAR_MIN, CHAR_MAX, D },
+{"sub_charenc", "subtitle character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, { .str = NULL }, CHAR_MIN, CHAR_MAX, D },
+{"libguess_language", "Language parameter for libguess charenc detection", OFFSET(libguess_language), AV_OPT_TYPE_STRING, { .str = NULL }, CHAR_MIN, CHAR_MAX, D },
+{"enca_language",     "Language parameter for enca charenc detection",     OFFSET(enca_language),     AV_OPT_TYPE_STRING, { .str = NULL }, CHAR_MIN, CHAR_MAX, D },
+{"enca_threshold",    "Threshold parameter for enca charenc detection",    OFFSET(enca_threshold),    AV_OPT_TYPE_DOUBLE, { .dbl = 1.38 }, 1.0,      DBL_MAX,  D },
+{"enca_multibyte",    "Whether or not to allow enca to guess multibyte charsets", OFFSET(enca_multibyte), AV_OPT_TYPE_INT, { .i64 = 1 },   0,        1,        D },
+{"enca_ambiguity",    "Whether or not to allow enca to return ambiguous results", OFFSET(enca_ambiguity), AV_OPT_TYPE_INT, { .i64 = 1 },   0,        1,        D },
 {NULL},
 };
 
diff --git a/libavformat/pjsdec.c b/libavformat/pjsdec.c
index 5129b70..252e9d9 100644
--- a/libavformat/pjsdec.c
+++ b/libavformat/pjsdec.c
@@ -67,6 +67,8 @@ static int pjs_read_header(AVFormatContext *s)
     AVStream *st = avformat_new_stream(s, NULL);
     int res = 0;
 
+    pjs->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 10);
diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c
index fff85d6..d20f0c5 100644
--- a/libavformat/realtextdec.c
+++ b/libavformat/realtextdec.c
@@ -67,6 +67,8 @@ static int realtext_read_header(AVFormatContext *s)
     FFTextReader tr;
     ff_text_init_avio(s, &tr, s->pb);
 
+    rt->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 100);
diff --git a/libavformat/samidec.c b/libavformat/samidec.c
index 948e1ed..968f506 100644
--- a/libavformat/samidec.c
+++ b/libavformat/samidec.c
@@ -56,6 +56,8 @@ static int sami_read_header(AVFormatContext *s)
     FFTextReader tr;
     ff_text_init_avio(s, &tr, s->pb);
 
+    sami->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 1000);
diff --git a/libavformat/srtdec.c b/libavformat/srtdec.c
index b35e50f..3187490 100644
--- a/libavformat/srtdec.c
+++ b/libavformat/srtdec.c
@@ -89,6 +89,8 @@ static int srt_read_header(AVFormatContext *s)
     FFTextReader tr;
     ff_text_init_avio(s, &tr, s->pb);
 
+    srt->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 1000);
diff --git a/libavformat/stldec.c b/libavformat/stldec.c
index b84c7e9..5d96737 100644
--- a/libavformat/stldec.c
+++ b/libavformat/stldec.c
@@ -74,6 +74,8 @@ static int stl_read_header(AVFormatContext *s)
     STLContext *stl = s->priv_data;
     AVStream *st = avformat_new_stream(s, NULL);
 
+    stl->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 100);
diff --git a/libavformat/subtitles.c b/libavformat/subtitles.c
index 67624fc..e953080 100644
--- a/libavformat/subtitles.c
+++ b/libavformat/subtitles.c
@@ -21,9 +21,23 @@
 #include "avformat.h"
 #include "subtitles.h"
 #include "avio_internal.h"
+#include "internal.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
 
+#if CONFIG_ICONV
+# include <iconv.h>
+#endif
+#if CONFIG_LIBGUESS
+# include <libguess.h>
+#endif
+#if CONFIG_ENCA
+# include <enca.h>
+#endif
+#if CONFIG_UCHARDET
+# include <uchardet.h>
+#endif
+
 void ff_text_init_avio(void *s, FFTextReader *r, AVIOContext *pb)
 {
     int i;
@@ -166,26 +180,268 @@ static int cmp_pkt_sub_pos_ts(const void *a, const void *b)
     return s1->pos > s2->pos ? 1 : -1;
 }
 
+/**
+ * Add a character encoding guess to an AVFormatContext's list
+ *
+ * @param avctx the context to add to
+ * @param enc   the encoding name to add; NULL or "" is ignored
+ *
+ * A copy is stored, so the caller keeps ownership of enc. Names already in
+ * the list are not added again; allocation failures silently drop the guess.
+ */
+static void add_charenc(AVFormatContext *avctx, const char *enc)
+{
+    char *copy;
+    int i;
+
+    if (!enc || !enc[0])
+        return;
+    for (i = 0; i < avctx->nb_sub_charenc_guesses; i++)
+        if (!strcmp(avctx->sub_charenc_guesses[i], enc))
+            return;
+
+    copy = av_strdup(enc);
+    if (!copy)
+        return;
+    dynarray_add(&avctx->sub_charenc_guesses, &avctx->nb_sub_charenc_guesses,
+                 copy);
+    if (!avctx->nb_sub_charenc_guesses) // the add failed and emptied the list
+        av_free(copy);
+}
+
+/**
+ * Finish an FFDemuxSubtitlesQueue and prepare it for reading
+ *
+ * @param q the queue to finish
+ *
+ * This sorts packets by position and/or timestamp, adjusts durations for
+ * formats that don't set them, and (if sub_charenc is "auto") concatenates
+ * the packet payloads into a scratch buffer for the charenc detectors.
+ * Each available detector is then run on that buffer to build the list of
+ * guesses, and the AVFormatContext's sub_charenc is replaced by the first
+ * guess (or left unset when there is none).
+ */
 void ff_subtitles_queue_finalize(FFDemuxSubtitlesQueue *q)
 {
     int i;
+    char *charenc_buf = NULL;
+    unsigned charenc_buf_size = 0, charenc_buf_len = 0;
+    AVFormatContext *avctx = q->avctx;
+    // Whether or not we're doing charenc detection here
+    int detection = avctx && avctx->sub_charenc &&
+                    !strcmp(avctx->sub_charenc, "auto");
 
     qsort(q->subs, q->nb_subs, sizeof(*q->subs),
           q->sort == SUB_SORT_TS_POS ? cmp_pkt_sub_ts_pos
                                      : cmp_pkt_sub_pos_ts);
-    for (i = 0; i < q->nb_subs; i++)
+    for (i = 0; i < q->nb_subs; i++) {
         if (q->subs[i].duration == -1 && i < q->nb_subs - 1)
             q->subs[i].duration = q->subs[i + 1].pts - q->subs[i].pts;
+
+        if (detection) {
+            char *newbuf = av_fast_realloc(charenc_buf, &charenc_buf_size,
+                                           charenc_buf_len + q->subs[i].size);
+            if (!newbuf)
+                continue;
+            charenc_buf = newbuf;
+
+            memcpy(charenc_buf + charenc_buf_len, q->subs[i].data,
+                   q->subs[i].size);
+            charenc_buf_len += q->subs[i].size;
+        }
+    }
+
+    if (detection) {
+#if CONFIG_LIBGUESS
+        if (avctx->libguess_language && charenc_buf_len) {
+            const char *enc =
+                libguess_determine_encoding(charenc_buf,
+                                            charenc_buf_len,
+                                            avctx->libguess_language);
+            av_log(avctx, AV_LOG_INFO, "libguess selected: %s\n",
+                   enc ? enc : "(none)");
+            add_charenc(avctx, enc);
+        }
+#endif
+#if CONFIG_ENCA
+        if (avctx->enca_language && charenc_buf_len) {
+            EncaAnalyser an = enca_analyser_alloc(avctx->enca_language);
+            if (an) {
+                EncaEncoding enc;
+                const char *str;
+                enca_set_threshold(an, avctx->enca_threshold);
+                enca_set_multibyte(an, avctx->enca_multibyte);
+                enca_set_ambiguity(an, avctx->enca_ambiguity);
+                enca_set_garbage_test(an, 1);
+
+                enc = enca_analyse_const(an, charenc_buf, charenc_buf_len);
+
+                str = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
+                av_log(avctx, AV_LOG_INFO, "ENCA selected: %s\n", str);
+                if (enca_charset_is_known(enc.charset))
+                    add_charenc(avctx, str);
+
+                enca_analyser_free(an);
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "ENCA allocation failed\n");
+            }
+        }
+#endif
+#if CONFIG_UCHARDET
+        if (charenc_buf_len) {
+            uchardet_t det = uchardet_new();
+            if (det) {
+                const char *enc;
+                uchardet_handle_data(det, charenc_buf, charenc_buf_len);
+                uchardet_data_end(det);
+                enc = uchardet_get_charset(det);
+                av_log(avctx, AV_LOG_INFO, "uchardet selected: %s\n", enc);
+                add_charenc(avctx, enc);
+                uchardet_delete(det);
+            }
+        }
+#endif
+        av_free(charenc_buf); // only the detectors needed the scratch buffer
+        av_freep(&avctx->sub_charenc);
+
+        if (avctx->nb_sub_charenc_guesses)
+            avctx->sub_charenc = av_strdup(avctx->sub_charenc_guesses[0]);
+    }
+}
+
+#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
+/**
+ * Convert an AVPacket's payload to UTF-8, using the selected encoding from an
+ * AVFormatContext and falling back on its other encoding guesses if necessary.
+ *
+ * @param avctx  the AVFormatContext whose character encodings we'll use
+ * @param outpkt the AVPacket to write to; freed again if recoding fails
+ * @param inpkt  the AVPacket to read from
+ * @return 0 on success, a negative AVERROR code on failure
+ *
+ * This first tries the AVFormatContext's sub_charenc, then each of its
+ * sub_charenc_guesses, every time restarting from the beginning of the input.
+ * If none decodes cleanly, sub_charenc is tried once more with illegal
+ * sequences discarded (where iconv supports it). If a guess succeeds,
+ * sub_charenc is changed to that encoding.
+ */
+static int recode_subtitle(AVFormatContext *avctx,
+                           AVPacket *outpkt, const AVPacket *inpkt)
+{
+#if CONFIG_ICONV
+    iconv_t cd = (iconv_t)-1;
+    int ret = 0;
+    char *inb, *outb;
+    size_t inl, outl;
+    AVPacket tmp;
+    int i;
+#endif
+
+    // Set attributes on the output packet that aren't covered by
+    // av_copy_packet, like the pts and duration.
+    *outpkt = *inpkt;
+
+    if (av_copy_packet(outpkt, inpkt))
+        return AVERROR(ENOMEM);
+
+    if (!avctx || !avctx->sub_charenc || inpkt->size == 0)
+        return 0;
+
+#if CONFIG_ICONV
+    if (inpkt->size >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    // Allocate a dummy packet that holds new buffers
+    ret = av_new_packet(&tmp, inpkt->size * UTF8_MAX_BYTES);
+    if (ret < 0)
+        goto end;
+
+    // Drop the plain copy made above; outpkt takes over tmp's buffer instead.
+    av_buffer_unref(&outpkt->buf);
+
+    for (i = -1; i <= avctx->nb_sub_charenc_guesses; i++) {
+        const char *encoding;
+        // If this is our last attempt, skip illegal sequences.
+        int discard_illegal = (i == avctx->nb_sub_charenc_guesses);
+        // Reset all buffers and sizes every time, as iconv advances them.
+        outpkt->buf  = tmp.buf;
+        outpkt->data = tmp.data;
+        outpkt->size = tmp.size;
+        outb = outpkt->data;
+        outl = outpkt->size;
+        inb  = inpkt->data;
+        inl  = inpkt->size;
+
+        // Try sub_charenc first, then each guess, then sub_charenc again with
+        // illegal sequences discarded.
+        if (i == -1 || i == avctx->nb_sub_charenc_guesses)
+            encoding = avctx->sub_charenc;
+        else
+            encoding = avctx->sub_charenc_guesses[i];
+
+        cd = iconv_open("UTF-8", encoding);
+        if (cd == (iconv_t)-1) {
+            av_log(avctx, AV_LOG_WARNING, "Invalid character encoding: %s\n",
+                   encoding);
+            ret = AVERROR(EINVAL);
+            continue;
+        }
+
+#ifdef ICONV_SET_DISCARD_ILSEQ /* GNU libiconv extension, absent from glibc */
+        iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &discard_illegal);
+#endif
+        // Try to run a conversion.
+        if (iconv(cd, &inb, &inl, &outb, &outl) != (size_t)-1 &&
+            iconv(cd, NULL, NULL, &outb, &outl) != (size_t)-1 &&
+            outl < outpkt->size && inl == 0) {
+            // Success, save the new encoding and get out.
+            if (discard_illegal) {
+                av_log(avctx, AV_LOG_WARNING, "Needed to discard illegal "
+                       "sequences while recoding subtitle event \"%s\" from %s "
+                       "to UTF-8\n", inpkt->data, avctx->sub_charenc);
+            } else if (i >= 0) {
+                av_log(avctx, AV_LOG_INFO, "Switching character encoding "
+                       "from %s to %s\n", avctx->sub_charenc, encoding);
+                av_freep(&avctx->sub_charenc);
+                avctx->sub_charenc = av_strdup(encoding);
+            }
+
+            // Remove and zero extra buffer space that iconv didn't end up using
+            outpkt->size -= outl;
+            memset(outpkt->data + outpkt->size, 0, outl);
+            iconv_close(cd);
+            return 0;
+        }
+
+        ret = FFMIN(AVERROR(errno), -1);
+        iconv_close(cd);
+    }
+
+    av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
+           "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
+
+end:
+    av_free_packet(outpkt);
+    return ret;
+#else
+    av_log(avctx, AV_LOG_ERROR, "requesting subtitles recoding without iconv\n");
+    av_free_packet(outpkt);
+    return AVERROR(EINVAL);
+#endif
 }
 
 int ff_subtitles_queue_read_packet(FFDemuxSubtitlesQueue *q, AVPacket *pkt)
 {
     AVPacket *sub = q->subs + q->current_sub_idx;
+    int ret;
 
     if (q->current_sub_idx == q->nb_subs)
         return AVERROR_EOF;
-    if (av_copy_packet(pkt, sub) < 0) {
-        return AVERROR(ENOMEM);
+    if ((ret = recode_subtitle(q->avctx, pkt, sub)) < 0) {
+        return ret;
     }
 
     pkt->dts = pkt->pts;
diff --git a/libavformat/subtitles.h b/libavformat/subtitles.h
index eb719ea..69ced11 100644
--- a/libavformat/subtitles.h
+++ b/libavformat/subtitles.h
@@ -100,6 +100,7 @@ int ff_text_peek_r8(FFTextReader *r);
 void ff_text_read(FFTextReader *r, char *buf, size_t size);
 
 typedef struct {
+    AVFormatContext *avctx; ///< AVFormat context; used for charenc parameters
     AVPacket *subs;         ///< array of subtitles packets
     int nb_subs;            ///< number of subtitles packets
     int allocated_size;     ///< allocated size for subs
diff --git a/libavformat/subviewer1dec.c b/libavformat/subviewer1dec.c
index 6b38533..35303ce 100644
--- a/libavformat/subviewer1dec.c
+++ b/libavformat/subviewer1dec.c
@@ -47,6 +47,8 @@ static int subviewer1_read_header(AVFormatContext *s)
     SubViewer1Context *subviewer1 = s->priv_data;
     AVStream *st = avformat_new_stream(s, NULL);
 
+    subviewer1->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 1);
diff --git a/libavformat/subviewerdec.c b/libavformat/subviewerdec.c
index f1b0fdf..1197a0c 100644
--- a/libavformat/subviewerdec.c
+++ b/libavformat/subviewerdec.c
@@ -76,6 +76,8 @@ static int subviewer_read_header(AVFormatContext *s)
     int duration = -1;
     AVPacket *sub = NULL;
 
+    subviewer->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 100);
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 5a2a72d..052b2fc 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -3597,6 +3597,8 @@ void avformat_free_context(AVFormatContext *s)
     av_dict_free(&s->metadata);
     av_freep(&s->streams);
     av_freep(&s->internal);
+    while (s->nb_sub_charenc_guesses--)
+        av_freep(&s->sub_charenc_guesses[s->nb_sub_charenc_guesses]);
     flush_packet_queue(s);
     av_free(s);
 }
diff --git a/libavformat/vplayerdec.c b/libavformat/vplayerdec.c
index 619ccfd..7cd3363 100644
--- a/libavformat/vplayerdec.c
+++ b/libavformat/vplayerdec.c
@@ -59,6 +59,8 @@ static int vplayer_read_header(AVFormatContext *s)
     VPlayerContext *vplayer = s->priv_data;
     AVStream *st = avformat_new_stream(s, NULL);
 
+    vplayer->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 100);
diff --git a/libavformat/webvttdec.c b/libavformat/webvttdec.c
index e457e8f..4d82cca 100644
--- a/libavformat/webvttdec.c
+++ b/libavformat/webvttdec.c
@@ -64,6 +64,8 @@ static int webvtt_read_header(AVFormatContext *s)
     int res = 0;
     AVStream *st = avformat_new_stream(s, NULL);
 
+    webvtt->q.avctx = s;
+
     if (!st)
         return AVERROR(ENOMEM);
     avpriv_set_pts_info(st, 64, 1, 1000);
-- 
1.9.1



More information about the ffmpeg-devel mailing list