[FFmpeg-devel] [PATCH 1/3] lavu: add av_is_valid_utf8().
Nicolas George
nicolas.george at normalesup.org
Sun Apr 7 10:23:44 CEST 2013
Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
---
doc/APIchanges | 3 ++
libavutil/avstring.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++
libavutil/avstring.h | 23 +++++++++++++++
libavutil/version.h | 2 +-
tests/ref/fate/avstring | 62 +++++++++++++++++++++++++++++++++++++++
5 files changed, 164 insertions(+), 1 deletion(-)
Note: I did not find a robust way of using the existing GET_UTF8 macro while
avoiding overreads.
diff --git a/doc/APIchanges b/doc/APIchanges
index 01f7825..8e33dbb 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -15,6 +15,9 @@ libavutil: 2012-10-22
API changes, most recent first:
+2013-04-07 - xxxxxxx - lavu 52.26.100 - avstring.h
+ Add av_is_valid_utf8().
+
2013-03-30 - xxxxxxx - lavu 52.24.100 - samplefmt.h
Add av_samples_alloc_array_and_samples().
diff --git a/libavutil/avstring.c b/libavutil/avstring.c
index cf9be2a..8b433fc 100644
--- a/libavutil/avstring.c
+++ b/libavutil/avstring.c
@@ -25,6 +25,7 @@
#include <string.h>
#include "config.h"
+#include "avassert.h"
#include "common.h"
#include "mem.h"
#include "avstring.h"
@@ -307,6 +308,51 @@ int av_isxdigit(int c)
return av_isdigit(c) || (c >= 'a' && c <= 'f');
}
+int av_is_valid_utf8(const uint8_t *str, size_t len, size_t *rend,
+                     unsigned flags)
+{
+    size_t chr = 0, byte;       /* start of current sequence, read cursor */
+    uint32_t codepoint;
+    unsigned lchr, i, min_bits; /* lchr: byte length of current sequence */
+
+    if (len == (size_t)-1)
+        len = strlen((const char *)str); /* strlen() wants char, not uint8_t */
+    while (chr < len) { /* bytes before chr are already validated */
+        byte = chr;
+        if (str[byte] & 0x80) {
+            if (!(str[byte] & 0x40) || str[byte] >= 0xFE) /* bad lead byte */
+                goto fail;
+            lchr = 7 - av_log2(str[byte] ^ 0xFF); /* count of leading 1s */
+            if (lchr > len - byte)                /* truncated sequence */
+                goto fail;
+            codepoint = str[byte++] & (0xFF >> lchr); /* lead byte payload */
+            for (i = 1; i < lchr; i++) {
+                if ((str[byte] & 0xC0) != 0x80)   /* not a 10xxxxxx byte */
+                    goto fail;
+                codepoint = (codepoint << 6) | (str[byte++] & 0x3F);
+            }
+            av_assert0(lchr == byte - chr);
+            /* 1 << min_bits: smallest code point that needs lchr bytes */
+            min_bits = lchr == 2 ? 7 : 5 * lchr - 4; /* 7, 11, 16, 21, 26 */
+#define REJECT(f) (!(flags & AV_UTF8_ACCEPT_ ## f))
+            if ((REJECT(OVERLONG)     && codepoint < (1U << min_bits)) ||
+                (REJECT(HIGH_PLANES)  && codepoint >= 0x110000) ||
+                (REJECT(SURROGATES)   &&
+                 codepoint >= 0xD800 && codepoint <= 0xDFFF) ||
+                (REJECT(REVERSED_BOM) && codepoint == 0xFFFE))
+                goto fail;
+#undef REJECT
+            chr = byte; /* sequence accepted: extend the valid prefix */
+        } else {
+            chr++;      /* plain 7-bit byte */
+        }
+    }
+fail: /* also reached on success, with chr == len */
+    if (rend)
+        *rend = chr;
+    return chr == len;
+}
+
#ifdef TEST
int main(void)
@@ -352,6 +398,35 @@ int main(void)
av_free(q);
}
+ {
+ const char *strings[] = {
+ "valid: A \xC3\x97 \xE2\x88\x80 \xF0\x9D\x9F\x98",
+ "continuation: \xC9""hec",
+ "orphan: \xE9""hec",
+ "BOM: \xEF\xBB\xBF",
+ "reversed_BOM: \xEF\xBF\xBE",
+ "truncated: \xE2",
+ "FE: \xFE",
+ "FF: \xFF",
+ "overlong: \xF0\x82\x82\xAC",
+ "5bytes: \xF8\x8D\x85\x99\xB8",
+ "6bytes: \xFC\x92\x8D\x85\x99\xB8",
+ "surrogate: \xED\xB1\x82",
+ };
+ int flags, ret;
+ size_t end;
+
+ printf("\nTesting av_is_valid_utf8()\n");
+ for (i = 0; i < FF_ARRAY_ELEMS(strings); i++) {
+ for (flags = 0; flags < 16; flags = !flags + (flags << 1)) {
+ ret = av_is_valid_utf8(strings[i], -1, &end, flags);
+ printf("%02x: %.*s: %svalid, end at %d / %d\n", flags,
+ (int)strcspn(strings[i], ":"), strings[i],
+ ret ? "" : "in", (int)end, (int)strlen(strings[i]));
+ }
+ }
+ }
+
return 0;
}
diff --git a/libavutil/avstring.h b/libavutil/avstring.h
index 438ef79..2e2fbec 100644
--- a/libavutil/avstring.h
+++ b/libavutil/avstring.h
@@ -22,6 +22,7 @@
#define AVUTIL_AVSTRING_H
#include <stddef.h>
+#include <stdint.h>
#include "attributes.h"
/**
@@ -296,6 +297,28 @@ int av_escape(char **dst, const char *src, const char *special_chars,
enum AVEscapeMode mode, int flags);
/**
+ * Test if a string is valid UTF-8.
+ * @param[in]  str   pointer to the string to examine
+ * @param[in]  len   length in bytes, or (size_t)-1 if str is 0-terminated
+ * @param[out] rend  if not NULL, will be set to the size (in bytes) of the
+ *                   longest prefix that is valid UTF-8
+ * @param[in]  flags OR of AV_UTF8_ACCEPT_* values (see below), each one
+ *                   disabling one test; 0 applies all tests
+ * @return 1 if the string is valid UTF-8, 0 if not
+ * @note Unlike most string processing functions, this also rejects overlong
+ *       encodings, code points >= 0x110000, surrogates and 0xFFFE.
+ */
+int av_is_valid_utf8(const uint8_t *str, size_t len, size_t *rend,
+ unsigned flags);
+
+enum { /* av_is_valid_utf8() flags; each set bit disables one rejection test */
+ AV_UTF8_ACCEPT_OVERLONG = 0x01, ///< accept overlong encodings
+ AV_UTF8_ACCEPT_HIGH_PLANES = 0x02, ///< accept code points >= 0x110000
+ AV_UTF8_ACCEPT_SURROGATES = 0x04, ///< accept surrogates, 0xD800..0xDFFF
+ AV_UTF8_ACCEPT_REVERSED_BOM = 0x08, ///< accept code point 0xFFFE
+};
+
+/**
* @}
*/
diff --git a/libavutil/version.h b/libavutil/version.h
index 7d1ab9c..6531397 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -75,7 +75,7 @@
*/
#define LIBAVUTIL_VERSION_MAJOR 52
-#define LIBAVUTIL_VERSION_MINOR 25
+#define LIBAVUTIL_VERSION_MINOR 26
#define LIBAVUTIL_VERSION_MICRO 100
#define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
diff --git a/tests/ref/fate/avstring b/tests/ref/fate/avstring
index bc231e8..4166192 100644
--- a/tests/ref/fate/avstring
+++ b/tests/ref/fate/avstring
@@ -25,3 +25,65 @@ Testing av_get_token()
|'foo : \ \ ' : blahblah| -> |foo : \ \ | + |: blahblah|
|'\fo\o:': blahblah| -> |\fo\o:| + |: blahblah|
|\'fo\o\:': foo ' :blahblah| -> |'foo:: foo | + |:blahblah|
+
+Testing av_is_valid_utf8()
+00: valid: valid, end at 21 / 21
+01: valid: valid, end at 21 / 21
+02: valid: valid, end at 21 / 21
+04: valid: valid, end at 21 / 21
+08: valid: valid, end at 21 / 21
+00: continuation: invalid, end at 14 / 18
+01: continuation: invalid, end at 14 / 18
+02: continuation: invalid, end at 14 / 18
+04: continuation: invalid, end at 14 / 18
+08: continuation: invalid, end at 14 / 18
+00: orphan: invalid, end at 8 / 12
+01: orphan: invalid, end at 8 / 12
+02: orphan: invalid, end at 8 / 12
+04: orphan: invalid, end at 8 / 12
+08: orphan: invalid, end at 8 / 12
+00: BOM: valid, end at 8 / 8
+01: BOM: valid, end at 8 / 8
+02: BOM: valid, end at 8 / 8
+04: BOM: valid, end at 8 / 8
+08: BOM: valid, end at 8 / 8
+00: reversed_BOM: invalid, end at 14 / 17
+01: reversed_BOM: invalid, end at 14 / 17
+02: reversed_BOM: invalid, end at 14 / 17
+04: reversed_BOM: invalid, end at 14 / 17
+08: reversed_BOM: valid, end at 17 / 17
+00: truncated: invalid, end at 11 / 12
+01: truncated: invalid, end at 11 / 12
+02: truncated: invalid, end at 11 / 12
+04: truncated: invalid, end at 11 / 12
+08: truncated: invalid, end at 11 / 12
+00: FE: invalid, end at 4 / 5
+01: FE: invalid, end at 4 / 5
+02: FE: invalid, end at 4 / 5
+04: FE: invalid, end at 4 / 5
+08: FE: invalid, end at 4 / 5
+00: FF: invalid, end at 4 / 5
+01: FF: invalid, end at 4 / 5
+02: FF: invalid, end at 4 / 5
+04: FF: invalid, end at 4 / 5
+08: FF: invalid, end at 4 / 5
+00: overlong: invalid, end at 10 / 14
+01: overlong: valid, end at 14 / 14
+02: overlong: invalid, end at 10 / 14
+04: overlong: invalid, end at 10 / 14
+08: overlong: invalid, end at 10 / 14
+00: 5bytes: invalid, end at 8 / 13
+01: 5bytes: invalid, end at 8 / 13
+02: 5bytes: valid, end at 13 / 13
+04: 5bytes: invalid, end at 8 / 13
+08: 5bytes: invalid, end at 8 / 13
+00: 6bytes: invalid, end at 8 / 14
+01: 6bytes: invalid, end at 8 / 14
+02: 6bytes: valid, end at 14 / 14
+04: 6bytes: invalid, end at 8 / 14
+08: 6bytes: invalid, end at 8 / 14
+00: surrogate: invalid, end at 11 / 14
+01: surrogate: invalid, end at 11 / 14
+02: surrogate: invalid, end at 11 / 14
+04: surrogate: valid, end at 14 / 14
+08: surrogate: invalid, end at 11 / 14
--
1.7.10.4
More information about the ffmpeg-devel
mailing list