[FFmpeg-devel] [PATCH 1/3] lavu: add av_is_valid_utf8().

Nicolas George nicolas.george at normalesup.org
Sun Apr 7 10:23:44 CEST 2013


Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
---
 doc/APIchanges          |    3 ++
 libavutil/avstring.c    |   75 +++++++++++++++++++++++++++++++++++++++++++++++
 libavutil/avstring.h    |   23 +++++++++++++++
 libavutil/version.h     |    2 +-
 tests/ref/fate/avstring |   62 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 164 insertions(+), 1 deletion(-)


Note: I did not find a robust way of using the existing GET_UTF8 macro while
avoiding overreads.


diff --git a/doc/APIchanges b/doc/APIchanges
index 01f7825..8e33dbb 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -15,6 +15,9 @@ libavutil:     2012-10-22
 
 API changes, most recent first:
 
+2013-04-07 - xxxxxxx - lavu 52.26.100 - avstring.h
+  Add av_is_valid_utf8().
+
 2013-03-30 - xxxxxxx - lavu 52.24.100 - samplefmt.h
   Add av_samples_alloc_array_and_samples().
 
diff --git a/libavutil/avstring.c b/libavutil/avstring.c
index cf9be2a..8b433fc 100644
--- a/libavutil/avstring.c
+++ b/libavutil/avstring.c
@@ -25,6 +25,7 @@
 #include <string.h>
 
 #include "config.h"
+#include "avassert.h"
 #include "common.h"
 #include "mem.h"
 #include "avstring.h"
@@ -307,6 +308,51 @@ int av_isxdigit(int c)
     return av_isdigit(c) || (c >= 'a' && c <= 'f');
 }
 
+int av_is_valid_utf8(const uint8_t *str, size_t len, size_t *rend,
+                     unsigned flags)
+{
+    size_t chr = 0, byte;
+    uint32_t codepoint;
+    unsigned lchr, i, min_bits;
+    /* chr: start of the current character; byte: read cursor inside it */
+    if (len == (size_t)-1)
+        len = strlen((const char *)str);
+    /* on any failure chr is left at the start of the offending character */
+    while (chr < len) {
+        byte = chr;
+        if (str[byte] & 0x80) {
+            if (!(str[byte] & 0x40) || str[byte] >= 0xFE)
+                goto fail; /* stray continuation byte, or 0xFE / 0xFF */
+            lchr = 7 - av_log2(str[byte] ^ 0xFF); /* length in bytes */
+            if (lchr > len - byte)
+                goto fail; /* sequence runs past the end of the buffer */
+            codepoint = str[byte++] & (0xFF >> lchr);
+            for (i = 1; i < lchr; i++) {
+                if ((str[byte] & 0xC0) != 0x80)
+                    goto fail;
+                codepoint = (codepoint << 6) | (str[byte++] & 0x3F);
+            }
+            av_assert0(lchr == byte - chr);
+            min_bits = lchr == 2 ? 7 : 5 * lchr - 4; /* overlong iff < 1 << min_bits */
+#define REJECT(f) (!(flags & AV_UTF8_ACCEPT_ ## f))
+            if ((REJECT(OVERLONG) && codepoint < (1U << min_bits)) ||
+                (REJECT(HIGH_PLANES) && codepoint >= 0x110000) ||
+                (REJECT(SURROGATES) &&
+                 codepoint >= 0xD800 && codepoint <= 0xDFFF) ||
+                (REJECT(REVERSED_BOM) && codepoint == 0xFFFE))
+                goto fail;
+#undef REJECT
+            chr = byte;
+        } else {
+            chr++;
+        }
+    }
+fail:
+    if (rend)
+        *rend = chr;
+    return chr == len;
+}
+
 #ifdef TEST
 
 int main(void)
@@ -352,6 +398,35 @@ int main(void)
         av_free(q);
     }
 
+    {   /* run av_is_valid_utf8() on every string with each flag setting */
+        const char *strings[] = {
+            "valid:  A \xC3\x97 \xE2\x88\x80 \xF0\x9D\x9F\x98",
+            "continuation: \xC9""hec",
+            "orphan: \xE9""hec",
+            "BOM: \xEF\xBB\xBF",
+            "reversed_BOM: \xEF\xBF\xBE",
+            "truncated: \xE2",
+            "FE: \xFE",
+            "FF: \xFF",
+            "overlong: \xF0\x82\x82\xAC",
+            "5bytes: \xF8\x8D\x85\x99\xB8",
+            "6bytes: \xFC\x92\x8D\x85\x99\xB8",
+            "surrogate: \xED\xB1\x82",
+        };
+        int flags, ret;
+        size_t end;
+
+        printf("\nTesting av_is_valid_utf8()\n");
+        for (i = 0; i < FF_ARRAY_ELEMS(strings); i++) {
+            for (flags = 0; flags < 16; flags = !flags + (flags << 1)) { /* 0, 1, 2, 4, 8 */
+                ret = av_is_valid_utf8(strings[i], -1, &end, flags);
+                printf("%02x: %.*s: %svalid, end at %d / %d\n", flags,
+                       (int)strcspn(strings[i], ":"), strings[i],
+                       ret ? "" : "in", (int)end, (int)strlen(strings[i]));
+            }
+        }
+    }
+
     return 0;
 }
 
diff --git a/libavutil/avstring.h b/libavutil/avstring.h
index 438ef79..2e2fbec 100644
--- a/libavutil/avstring.h
+++ b/libavutil/avstring.h
@@ -22,6 +22,7 @@
 #define AVUTIL_AVSTRING_H
 
 #include <stddef.h>
+#include <stdint.h>
 #include "attributes.h"
 
 /**
@@ -296,6 +297,28 @@ int av_escape(char **dst, const char *src, const char *special_chars,
               enum AVEscapeMode mode, int flags);
 
 /**
+ * Test if a string is valid UTF-8.
+ * @param[in]  str    pointer to the string to examine
+ * @param[in]  len    length of the string, in bytes;
+ *                    (size_t)-1 if it is 0-terminated
+ * @param[out] rend   if not NULL, will be set to the size (in bytes) of the
+ *                    longest prefix that is valid UTF-8
+ * @param[in]  flags  control details of the process; see AV_UTF8_* below
+ * @return  1 if the string is valid UTF-8, 0 if not
+ * @note  This function performs tests that are usually skipped by normal
+ * string processing functions.
+ */
+int av_is_valid_utf8(const uint8_t *str, size_t len, size_t *rend,
+                     unsigned flags);
+
+enum { /* flags for av_is_valid_utf8(); each one disables one rejection */
+    AV_UTF8_ACCEPT_OVERLONG        = 0x01, /**< allow overlong encodings */
+    AV_UTF8_ACCEPT_HIGH_PLANES     = 0x02, /**< allow codepoints >= 0x110000 */
+    AV_UTF8_ACCEPT_SURROGATES      = 0x04, /**< allow 0xD800..0xDFFF */
+    AV_UTF8_ACCEPT_REVERSED_BOM    = 0x08, /**< allow codepoint 0xFFFE */
+};
+
+/**
  * @}
  */
 
diff --git a/libavutil/version.h b/libavutil/version.h
index 7d1ab9c..6531397 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -75,7 +75,7 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  52
-#define LIBAVUTIL_VERSION_MINOR  25
+#define LIBAVUTIL_VERSION_MINOR  26
 #define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
diff --git a/tests/ref/fate/avstring b/tests/ref/fate/avstring
index bc231e8..4166192 100644
--- a/tests/ref/fate/avstring
+++ b/tests/ref/fate/avstring
@@ -25,3 +25,65 @@ Testing av_get_token()
 |'foo : \ \  '   : blahblah| -> |foo : \ \  | + |: blahblah|
 |'\fo\o:': blahblah| -> |\fo\o:| + |: blahblah|
 |\'fo\o\:':  foo  '  :blahblah| -> |'foo::  foo  | + |:blahblah|
+
+Testing av_is_valid_utf8()
+00: valid: valid, end at 21 / 21
+01: valid: valid, end at 21 / 21
+02: valid: valid, end at 21 / 21
+04: valid: valid, end at 21 / 21
+08: valid: valid, end at 21 / 21
+00: continuation: invalid, end at 14 / 18
+01: continuation: invalid, end at 14 / 18
+02: continuation: invalid, end at 14 / 18
+04: continuation: invalid, end at 14 / 18
+08: continuation: invalid, end at 14 / 18
+00: orphan: invalid, end at 8 / 12
+01: orphan: invalid, end at 8 / 12
+02: orphan: invalid, end at 8 / 12
+04: orphan: invalid, end at 8 / 12
+08: orphan: invalid, end at 8 / 12
+00: BOM: valid, end at 8 / 8
+01: BOM: valid, end at 8 / 8
+02: BOM: valid, end at 8 / 8
+04: BOM: valid, end at 8 / 8
+08: BOM: valid, end at 8 / 8
+00: reversed_BOM: invalid, end at 14 / 17
+01: reversed_BOM: invalid, end at 14 / 17
+02: reversed_BOM: invalid, end at 14 / 17
+04: reversed_BOM: invalid, end at 14 / 17
+08: reversed_BOM: valid, end at 17 / 17
+00: truncated: invalid, end at 11 / 12
+01: truncated: invalid, end at 11 / 12
+02: truncated: invalid, end at 11 / 12
+04: truncated: invalid, end at 11 / 12
+08: truncated: invalid, end at 11 / 12
+00: FE: invalid, end at 4 / 5
+01: FE: invalid, end at 4 / 5
+02: FE: invalid, end at 4 / 5
+04: FE: invalid, end at 4 / 5
+08: FE: invalid, end at 4 / 5
+00: FF: invalid, end at 4 / 5
+01: FF: invalid, end at 4 / 5
+02: FF: invalid, end at 4 / 5
+04: FF: invalid, end at 4 / 5
+08: FF: invalid, end at 4 / 5
+00: overlong: invalid, end at 10 / 14
+01: overlong: valid, end at 14 / 14
+02: overlong: invalid, end at 10 / 14
+04: overlong: invalid, end at 10 / 14
+08: overlong: invalid, end at 10 / 14
+00: 5bytes: invalid, end at 8 / 13
+01: 5bytes: invalid, end at 8 / 13
+02: 5bytes: valid, end at 13 / 13
+04: 5bytes: invalid, end at 8 / 13
+08: 5bytes: invalid, end at 8 / 13
+00: 6bytes: invalid, end at 8 / 14
+01: 6bytes: invalid, end at 8 / 14
+02: 6bytes: valid, end at 14 / 14
+04: 6bytes: invalid, end at 8 / 14
+08: 6bytes: invalid, end at 8 / 14
+00: surrogate: invalid, end at 11 / 14
+01: surrogate: invalid, end at 11 / 14
+02: surrogate: invalid, end at 11 / 14
+04: surrogate: valid, end at 14 / 14
+08: surrogate: invalid, end at 11 / 14
-- 
1.7.10.4



More information about the ffmpeg-devel mailing list