[FFmpeg-devel] [PATCH 1/2] avformat/matroskadec: add support for IETF BCP47 locale tag

Marth64 marth64 at proxyid.net
Wed Nov 20 09:20:43 EET 2024


Matroska EBML allows for an element LanguageBCP47 per stream, which
contains an IETF BCP 47 locale code that can declare the language,
script, and region as defined in RFC 5646. For example, "en-US"
indicates English as used in the United States. The specification also
states that if the LanguageBCP47 element is present, it shall
override any Language element that is set.

There are Matroska muxers which support setting this field.
This commit implements support for reading LanguageBCP47 and
applying it to stream-level tags. The language component
of the parsed element is converted to its ISO 639-2 (bibliographic)
code and assigned to the common "language" tag, while the entire
locale itself is assigned to a "locale" tag when it consists of
more than one component.

Note that there are valid cases where the language component is
undetermined, but the regional component is set (e.g. und-US).

Signed-off-by: Marth64 <marth64 at proxyid.net>
---
 libavformat/matroska.h    |  1 +
 libavformat/matroskadec.c | 45 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/libavformat/matroska.h b/libavformat/matroska.h
index 719f2ef796..aa0b7a5df8 100644
--- a/libavformat/matroska.h
+++ b/libavformat/matroska.h
@@ -94,6 +94,7 @@
 #define MATROSKA_ID_SEEKPREROLL 0x56BB
 #define MATROSKA_ID_TRACKNAME  0x536E
 #define MATROSKA_ID_TRACKLANGUAGE 0x22B59C
+#define MATROSKA_ID_TRACKLANGUAGEBCP47 0x22B59D
 #define MATROSKA_ID_TRACKFLAGENABLED 0xB9
 #define MATROSKA_ID_TRACKFLAGDEFAULT 0x88
 #define MATROSKA_ID_TRACKFLAGFORCED 0x55AA
diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
index a973b62756..7666e2cb28 100644
--- a/libavformat/matroskadec.c
+++ b/libavformat/matroskadec.c
@@ -60,6 +60,7 @@
 #include "libavcodec/packet_internal.h"
 
 #include "avformat.h"
+#include "avlanguage.h"
 #include "avio_internal.h"
 #include "demux.h"
 #include "dovi_isom.h"
@@ -270,6 +271,7 @@ typedef struct MatroskaTrack {
     char    *codec_id;
     EbmlBin  codec_priv;
     char    *language;
+    char    *locale;
     double time_scale;
     uint64_t default_duration;
     uint64_t flag_default;
@@ -448,7 +450,7 @@ typedef struct MatroskaDemuxContext {
 // incomplete type (6.7.2 in C90, 6.9.2 in C99).
 // Removing the sizes breaks MSVC.
 static EbmlSyntax ebml_syntax[3], matroska_segment[9], matroska_track_video_color[15], matroska_track_video[19],
-                  matroska_track[33], matroska_track_encoding[6], matroska_track_encodings[2],
+                  matroska_track[34], matroska_track_encoding[6], matroska_track_encodings[2],
                   matroska_track_combine_planes[2], matroska_track_operation[2], matroska_block_addition_mapping[5], matroska_tracks[2],
                   matroska_attachments[2], matroska_chapter_entry[9], matroska_chapter[6], matroska_chapters[2],
                   matroska_index_entry[3], matroska_index[2], matroska_tag[3], matroska_tags[2], matroska_seekhead[2],
@@ -616,6 +618,7 @@ static EbmlSyntax matroska_track[] = {
     { MATROSKA_ID_CODECPRIVATE,          EBML_BIN,   0, 0, offsetof(MatroskaTrack, codec_priv) },
     { MATROSKA_ID_CODECDELAY,            EBML_UINT,  0, 0, offsetof(MatroskaTrack, codec_delay),  { .u = 0 } },
     { MATROSKA_ID_TRACKLANGUAGE,         EBML_STR,   0, 0, offsetof(MatroskaTrack, language),     { .s = "eng" } },
+    { MATROSKA_ID_TRACKLANGUAGEBCP47,    EBML_STR,   0, 0, offsetof(MatroskaTrack, locale) },
     { MATROSKA_ID_TRACKDEFAULTDURATION,  EBML_UINT,  0, 0, offsetof(MatroskaTrack, default_duration) },
     { MATROSKA_ID_TRACKTIMECODESCALE,    EBML_FLOAT, 0, 0, offsetof(MatroskaTrack, time_scale),   { .f = 1.0 } },
     { MATROSKA_ID_TRACKFLAGCOMMENTARY,   EBML_UINT,  0, 0, offsetof(MatroskaTrack, flag_comment), { .u = 0 } },
@@ -3054,6 +3057,19 @@ static int mkv_parse_video(MatroskaTrack *track, AVStream *st,
     return 0;
 }
 
+/* Validate one BCP47 subtag: returns 1 iff s is <= 8 chars of ASCII letters/digits, else 0 (note: "" passes) */
+static int mkv_validate_bcp47_part(const char *s)
+{
+    if (strlen(s) > 8)
+        return 0;
+
+    while (*s && ((*s >= 'a' && *s <= 'z')  ||
+                  (*s >= 'A' && *s <= 'Z')  ||
+                  (*s >= '0' && *s <= '9')))
+        s++;
+    return !*s;
+}
+
 /* Performs the codec-specific part of parsing a subtitle track. */
 static int mkv_parse_subtitle_codec(MatroskaTrack *track, AVStream *st,
                                     AVCodecParameters *par,
@@ -3223,8 +3239,33 @@ static int matroska_parse_tracks(AVFormatContext *s)
                         AV_DICT_DONT_STRDUP_VAL);
         }
 
-        if (strcmp(track->language, "und"))
+        if (track->locale) {
+            const char *locale_language;
+            char *locale_tmp = av_strdup(track->locale);
+            char *locale_part, *locale_saveptr;
+            int locale_partn = 0;
+
+            locale_part = av_strtok(locale_tmp, "-", &locale_saveptr);
+            while (locale_part) {
+                if (!mkv_validate_bcp47_part(locale_part))
+                    break;
+
+                if (locale_partn == 0) {
+                    locale_language = ff_convert_lang_to(locale_part, AV_LANG_ISO639_2_BIBL);
+
+                    if (strcmp(locale_language, "und"))
+                        av_dict_set(&st->metadata, "language", locale_language, 0);
+                }
+
+                locale_part = av_strtok(NULL, "-", &locale_saveptr);
+                locale_partn++;
+            }
+
+            if (locale_partn > 1)
+                av_dict_set(&st->metadata, "locale", track->locale, 0);
+        } else if (strcmp(track->language, "und")) {
             av_dict_set(&st->metadata, "language", track->language, 0);
+        }
         av_dict_set(&st->metadata, "title", track->name, 0);
 
         if (track->time_scale < 0.01) {
-- 
2.34.1



More information about the ffmpeg-devel mailing list