[FFmpeg-devel] [PATCH v2 1/6] avformat/assdec: UTF-16 support

wm4 nfxjfg at googlemail.com
Thu Sep 4 22:40:22 CEST 2014


Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents
to UTF-8 on the fly using FFTextReader, which acts as a converting wrapper
around AVIOContext. It can also work on a static buffer, which is needed
for format probing. The FFTextReader wrapper now also takes care of
skipping the UTF-8 BOM.
---
Now uses an enum for UTF types.
---
 libavformat/assdec.c    | 19 +++++++++------
 libavformat/subtitles.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 libavformat/subtitles.h | 57 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+), 7 deletions(-)

diff --git a/libavformat/assdec.c b/libavformat/assdec.c
index bb953c7..a5f792a 100644
--- a/libavformat/assdec.c
+++ b/libavformat/assdec.c
@@ -33,10 +33,13 @@ typedef struct ASSContext {
 
 static int ass_probe(AVProbeData *p)
 {
-    const char *header = "[Script Info]";
+    char buf[13];
+    FFTextReader tr;
+    ff_text_init_buf(&tr, p->buf, p->buf_size);
 
-    if (!memcmp(p->buf, header, strlen(header)) ||
-        !memcmp(p->buf + 3, header, strlen(header)))
+    ff_text_read(&tr, buf, sizeof(buf));
+
+    if (!memcmp(buf, "[Script Info]", 13))
         return AVPROBE_SCORE_MAX;
 
     return 0;
@@ -66,13 +69,13 @@ static int read_ts(const uint8_t *p, int64_t *start, int *duration)
     return -1;
 }
 
-static int64_t get_line(AVBPrint *buf, AVIOContext *pb)
+static int64_t get_line(AVBPrint *buf, FFTextReader *tr)
 {
-    int64_t pos = avio_tell(pb);
+    int64_t pos = ff_text_pos(tr);
 
     av_bprint_clear(buf);
     for (;;) {
-        char c = avio_r8(pb);
+        char c = ff_text_r8(tr);
         if (!c)
             break;
         av_bprint_chars(buf, c, 1);
@@ -88,6 +91,8 @@ static int ass_read_header(AVFormatContext *s)
     AVBPrint header, line;
     int header_remaining, res = 0;
     AVStream *st;
+    FFTextReader tr;
+    ff_text_init_avio(&tr, s->pb);
 
     st = avformat_new_stream(s, NULL);
     if (!st)
@@ -102,7 +107,7 @@ static int ass_read_header(AVFormatContext *s)
     av_bprint_init(&line,   0, AV_BPRINT_SIZE_UNLIMITED);
 
     for (;;) {
-        int64_t pos = get_line(&line, s->pb);
+        int64_t pos = get_line(&line, &tr);
 
         if (!line.str[0]) // EOF
             break;
diff --git a/libavformat/subtitles.c b/libavformat/subtitles.c
index fce2bf1..cebd453 100644
--- a/libavformat/subtitles.c
+++ b/libavformat/subtitles.c
@@ -20,9 +20,72 @@
 
 #include "avformat.h"
 #include "subtitles.h"
+#include "avio_internal.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
 
+void ff_text_init_avio(FFTextReader *r, AVIOContext *pb)
+{
+    int i;
+    r->pb = pb;
+    r->buf_pos = r->buf_len = 0; // nothing buffered yet
+    r->type = FF_UTF_8; // default; overridden below if a UTF-16 BOM is seen
+    for (i = 0; i < 2; i++) // prefetch two bytes for the UTF-16 BOM test
+        r->buf[r->buf_len++] = avio_r8(r->pb);
+    if (strncmp("\xFF\xFE", r->buf, 2) == 0) { // NOTE(review): buf is unsigned char[]; memcmp may be the cleaner call - confirm
+        r->type = FF_UTF16LE;
+        r->buf_pos += 2; // skip the BOM so ff_text_r8() never returns it
+    } else if (strncmp("\xFE\xFF", r->buf, 2) == 0) {
+        r->type = FF_UTF16BE;
+        r->buf_pos += 2; // skip the BOM so ff_text_r8() never returns it
+    } else {
+        r->buf[r->buf_len++] = avio_r8(r->pb); // third byte, needed for the UTF-8 BOM test
+        if (strncmp("\xEF\xBB\xBF", r->buf, 3) == 0) {
+            // UTF-8 BOM: skip it; without a BOM the 3 bytes stay buffered for ff_text_r8()
+            r->buf_pos += 3;
+        }
+    }
+}
+
+void ff_text_init_buf(FFTextReader *r, void *buf, size_t size)
+{
+    memset(&r->buf_pb, 0, sizeof(r->buf_pb)); // start from a zeroed embedded context
+    ffio_init_context(&r->buf_pb, buf, size, 0, NULL, NULL, NULL, NULL); // point it at buf/size, no callbacks
+    ff_text_init_avio(r, &r->buf_pb); // BOM detection then runs on the embedded context
+}
+
+int64_t ff_text_pos(FFTextReader *r)
+{
+    return avio_tell(r->pb) - r->buf_len + r->buf_pos; // stream offset minus the buffered bytes not yet handed out
+}
+
+int ff_text_r8(FFTextReader *r)
+{
+    uint32_t val; // code point decoded from UTF-16 input
+    uint8_t tmp; // scratch byte used by PUT_UTF8
+    if (r->buf_pos < r->buf_len) // drain already-buffered bytes first
+        return r->buf[r->buf_pos++];
+    if (r->type == FF_UTF16LE) {
+        GET_UTF16(val, avio_rl16(r->pb), return 0;) // 16-bit little-endian units; last argument bails out with 0
+    } else if (r->type == FF_UTF16BE) {
+        GET_UTF16(val, avio_rb16(r->pb), return 0;) // 16-bit big-endian units; last argument bails out with 0
+    } else {
+        return avio_r8(r->pb); // 8-bit input is passed through unconverted
+    }
+    if (!val) // code point 0: return 0 directly, nothing to buffer
+        return 0;
+    r->buf_pos = 0; // restart the buffer for the newly encoded sequence
+    r->buf_len = 0;
+    PUT_UTF8(val, tmp, r->buf[r->buf_len++] = tmp;) // append each UTF-8 byte of val to buf
+    return r->buf[r->buf_pos++]; // buf_len is at least 1
+}
+
+void ff_text_read(FFTextReader *r, char *buf, size_t size)
+{
+    for ( ; size > 0; size--) // always writes exactly size bytes
+        *buf++ = ff_text_r8(r); // one ff_text_r8() result per output byte
+}
+
 AVPacket *ff_subtitles_queue_insert(FFDemuxSubtitlesQueue *q,
                                     const uint8_t *event, int len, int merge)
 {
diff --git a/libavformat/subtitles.h b/libavformat/subtitles.h
index b5a96ec..317e40a 100644
--- a/libavformat/subtitles.h
+++ b/libavformat/subtitles.h
@@ -30,6 +30,63 @@ enum sub_sort {
     SUB_SORT_POS_TS,        ///< sort by position, then timestamps
 };
 
+enum ff_utf_type {
+    FF_UTF_8,       // or other 8 bit encodings
+    FF_UTF16LE,     // selected when the stream starts with the bytes FF FE
+    FF_UTF16BE,     // selected when the stream starts with the bytes FE FF
+};
+
+typedef struct {
+    int type;               ///< one of the ff_utf_type values, set by ff_text_init_avio()
+    AVIOContext *pb;        ///< stream the reader pulls its input from
+    unsigned char buf[8];   ///< bytes already fetched or converted but not yet returned
+    int buf_pos, buf_len;   ///< next index to return from buf / number of valid bytes in buf
+    AVIOContext buf_pb;     ///< embedded context used by ff_text_init_buf()
+} FFTextReader;
+
+/**
+ * Initialize the FFTextReader from the given AVIOContext. This function will
+ * read some bytes from pb, and test for UTF-8 or UTF-16 BOMs. Further accesses
+ * to FFTextReader will read more data from pb.
+ *
+ * The purpose of FFTextReader is to transparently convert read data to UTF-8
+ * if the stream had a UTF-16 BOM.
+ *
+ * @param r object which will be initialized
+ * @param pb stream to read from (referenced as long as FFTextReader is in use)
+ */
+void ff_text_init_avio(FFTextReader *r, AVIOContext *pb);
+
+/**
+ * Similar to ff_text_init_avio(), but sets it up to read from a bounded buffer.
+ *
+ * @param r object which will be initialized
+ * @param buf buffer to read from (referenced as long as FFTextReader is in use)
+ * @param size size of buf
+ */
+void ff_text_init_buf(FFTextReader *r, void *buf, size_t size);
+
+/**
+ * Return the byte position of the next byte returned by ff_text_r8(). For
+ * UTF-16 source streams, this will return the original position, but it will
+ * be incorrect if a codepoint was only partially read with ff_text_r8().
+ */
+int64_t ff_text_pos(FFTextReader *r);
+
+/**
+ * Return the next byte. The return value is always 0 - 255. Returns 0 on EOF.
+ * If the source stream is UTF-16, this reads from the stream converted to
+ * UTF-8. On invalid UTF-16, 0 is returned.
+ *
+ */
+int ff_text_r8(FFTextReader *r);
+
+/**
+ * Read the given number of bytes (in UTF-8). On error or EOF, \0 bytes are
+ * written.
+ */
+void ff_text_read(FFTextReader *r, char *buf, size_t size);
+
 typedef struct {
     AVPacket *subs;         ///< array of subtitles packets
     int nb_subs;            ///< number of subtitles packets
-- 
2.1.0



More information about the ffmpeg-devel mailing list