[FFmpeg-devel] [PATCH v2 12/16] swscale/cms: add color management subsystem

Fri Dec 6 16:32:11 EET 2024

From: Niklas Haas <git at haasn.dev>

The underlying color mapping logic was ported as straightforwardly as possible
from libplacebo, although the API and glue code has been very heavily
refactored / rewritten. In particular, the generalization of gamut mapping
methods is replaced by a single ICC intent selection, and constants have been
hard-coded.

To minimize the amount of overall operations, this gamut mapping LUT now embeds
a direct end-to-end transformation to the output color space; something that
libplacebo does in shaders, but which is prohibitively expensive in software.

In order to preserve compatibility with dynamic tone mapping without severely
regressing performance, we add the ability to generate a pair of "split" LUTS,
one for encoding the input and output to the perceptual color space, and a
third to embed the tone mapping operation. Additionally, this intermediate
space could be used for additional subjective effect (e.g. changing
saturation or brightness).

The big downside of the new approach is that generating a static color mapping
LUT is now fairly slow, as the chromaticity lobe peaks have to be recomputed
for every single RGB value, since correlated RGB colors are not necessarily
aligned in ICh space. Generating a split 3DLUT significantly alleviates this
problem because the expensive step is done as part of the IPT input LUT, which
can share the same hue peak calculation at least for all input intensities.
---
 libswscale/Makefile |   1 +
 libswscale/cms.c    | 766 ++++++++++++++++++++++++++++++++++++++++++++
 libswscale/cms.h    | 105 ++++++
 3 files changed, 872 insertions(+)
 create mode 100644 libswscale/cms.c
 create mode 100644 libswscale/cms.h

diff --git a/libswscale/Makefile b/libswscale/Makefile
index 08036634b7..c4e45d494e 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -6,6 +6,7 @@ HEADERS = swscale.h                                                     \
           version_major.h                                               \
 
 OBJS = alphablend.o                                     \
+       cms.o                                            \
        csputils.o                                       \
        hscale.o                                         \
        hscale_fast_bilinear.o                           \
diff --git a/libswscale/cms.c b/libswscale/cms.c
new file mode 100644
index 0000000000..7793a19b87
--- /dev/null
+++ b/libswscale/cms.c
@@ -0,0 +1,766 @@
+/*
+ * Copyright (C) 2024 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/csp.h"
+#include "libavutil/slicethread.h"
+
+#include "cms.h"
+#include "csputils.h"
+#include "libswscale/swscale.h"
+#include "utils.h"
+
+bool sws_color_map_noop(const SwsColorMap *map)
+{
+    /* If the encoding space is different, we must go through a conversion */
+    if (map->src.prim != map->dst.prim || map->src.trc != map->dst.trc)
+        return false;
+
+    /* If the black point changes, we have to perform black point compensation */
+    if (av_cmp_q(map->src.min_luma, map->dst.min_luma))
+        return false;
+
+    switch (map->intent) {
+    case SWS_INTENT_ABSOLUTE_COLORIMETRIC:
+    case SWS_INTENT_RELATIVE_COLORIMETRIC:
+        return ff_prim_superset(&map->dst.gamut, &map->src.gamut) &&
+               av_cmp_q(map->src.max_luma, map->dst.max_luma) <= 0;
+    case SWS_INTENT_PERCEPTUAL:
+    case SWS_INTENT_SATURATION:
+        return ff_prim_equal(&map->dst.gamut, &map->src.gamut) &&
+               !av_cmp_q(map->src.max_luma, map->dst.max_luma);
+    default:
+        av_assert0(!"Invalid gamut mapping intent?");
+        return true;
+    }
+}
+
+/* Approximation of gamut hull at a given intensity level */
+static const float hull(float I)
+{
+    return ((I - 6.0f) * I + 9.0f) * I;
+}
+
+/* For some minimal type safety, and code cleanliness */
+typedef struct RGB {
+    float R, G, B; /* nits */
+} RGB;
+
+typedef struct IPT {
+    float I, P, T;
+} IPT;
+
+typedef struct ICh {
+    float I, C, h;
+} ICh;
+
+static av_always_inline ICh ipt2ich(IPT c)
+{
+    return (ICh) {
+        .I = c.I,
+        .C = sqrtf(c.P * c.P + c.T * c.T),
+        .h = atan2f(c.T, c.P),
+    };
+}
+
+static av_always_inline IPT ich2ipt(ICh c)
+{
+    return (IPT) {
+        .I = c.I,
+        .P = c.C * cosf(c.h),
+        .T = c.C * sinf(c.h),
+    };
+}
+
+/* Helper struct containing pre-computed cached values describing a gamut */
+typedef struct Gamut {
+    SwsMatrix3x3 encoding2lms;
+    SwsMatrix3x3 lms2encoding;
+    SwsMatrix3x3 lms2content;
+    SwsMatrix3x3 content2lms;
+    av_csp_eotf_function eotf;
+    av_csp_eotf_function eotf_inv;
+    float Iavg_frame;
+    float Imax_frame;
+    float Imin, Imax;
+    float Lb, Lw;
+    AVCIExy wp;
+    ICh peak; /* updated as needed in loop body when hue changes */
+} Gamut;
+
+static Gamut gamut_from_colorspace(SwsColor fmt)
+{
+    const AVColorPrimariesDesc *encoding = av_csp_primaries_desc_from_id(fmt.prim);
+    const AVColorPrimariesDesc content = {
+        .prim = fmt.gamut,
+        .wp   = encoding->wp,
+    };
+
+    const float Lw = av_q2d(fmt.max_luma), Lb = av_q2d(fmt.min_luma);
+    const float Imax = pq_oetf(Lw);
+
+    return (Gamut) {
+        .encoding2lms = ff_sws_ipt_rgb2lms(encoding),
+        .lms2encoding = ff_sws_ipt_lms2rgb(encoding),
+        .lms2content  = ff_sws_ipt_lms2rgb(&content),
+        .content2lms  = ff_sws_ipt_rgb2lms(&content),
+        .eotf         = av_csp_itu_eotf(fmt.trc),
+        .eotf_inv     = av_csp_itu_eotf_inv(fmt.trc),
+        .wp           = encoding->wp,
+        .Imin         = pq_oetf(Lb),
+        .Imax         = Imax,
+        .Imax_frame   = fmt.frame_peak.den ? pq_oetf(av_q2d(fmt.frame_peak)) : Imax,
+        .Iavg_frame   = fmt.frame_avg.den  ? pq_oetf(av_q2d(fmt.frame_avg))  : 0.0f,
+        .Lb           = Lb,
+        .Lw           = Lw,
+    };
+}
+
+static av_always_inline IPT rgb2ipt(RGB c, const SwsMatrix3x3 rgb2lms)
+{
+    const float L = rgb2lms.m[0][0] * c.R +
+                    rgb2lms.m[0][1] * c.G +
+                    rgb2lms.m[0][2] * c.B;
+    const float M = rgb2lms.m[1][0] * c.R +
+                    rgb2lms.m[1][1] * c.G +
+                    rgb2lms.m[1][2] * c.B;
+    const float S = rgb2lms.m[2][0] * c.R +
+                    rgb2lms.m[2][1] * c.G +
+                    rgb2lms.m[2][2] * c.B;
+    const float Lp = pq_oetf(L);
+    const float Mp = pq_oetf(M);
+    const float Sp = pq_oetf(S);
+    return (IPT) {
+        .I = 0.4000f * Lp + 0.4000f * Mp + 0.2000f * Sp,
+        .P = 4.4550f * Lp - 4.8510f * Mp + 0.3960f * Sp,
+        .T = 0.8056f * Lp + 0.3572f * Mp - 1.1628f * Sp,
+    };
+}
+
+static av_always_inline RGB ipt2rgb(IPT c, const SwsMatrix3x3 lms2rgb)
+{
+    const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T;
+    const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T;
+    const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T;
+    const float L = pq_eotf(Lp);
+    const float M = pq_eotf(Mp);
+    const float S = pq_eotf(Sp);
+    return (RGB) {
+        .R = lms2rgb.m[0][0] * L +
+             lms2rgb.m[0][1] * M +
+             lms2rgb.m[0][2] * S,
+        .G = lms2rgb.m[1][0] * L +
+             lms2rgb.m[1][1] * M +
+             lms2rgb.m[1][2] * S,
+        .B = lms2rgb.m[2][0] * L +
+             lms2rgb.m[2][1] * M +
+             lms2rgb.m[2][2] * S,
+    };
+}
+
+static inline bool ingamut(IPT c, Gamut gamut)
+{
+    const float min_rgb = gamut.Lb - 1e-4f;
+    const float max_rgb = gamut.Lw + 1e-2f;
+    const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T;
+    const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T;
+    const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T;
+    if (Lp < gamut.Imin || Lp > gamut.Imax ||
+        Mp < gamut.Imin || Mp > gamut.Imax ||
+        Sp < gamut.Imin || Sp > gamut.Imax)
+    {
+        /* Values outside legal LMS range */
+        return false;
+    } else {
+        const float L = pq_eotf(Lp);
+        const float M = pq_eotf(Mp);
+        const float S = pq_eotf(Sp);
+        RGB rgb = {
+            .R = gamut.lms2content.m[0][0] * L +
+                 gamut.lms2content.m[0][1] * M +
+                 gamut.lms2content.m[0][2] * S,
+            .G = gamut.lms2content.m[1][0] * L +
+                 gamut.lms2content.m[1][1] * M +
+                 gamut.lms2content.m[1][2] * S,
+            .B = gamut.lms2content.m[2][0] * L +
+                 gamut.lms2content.m[2][1] * M +
+                 gamut.lms2content.m[2][2] * S,
+        };
+        return rgb.R >= min_rgb && rgb.R <= max_rgb &&
+               rgb.G >= min_rgb && rgb.G <= max_rgb &&
+               rgb.B >= min_rgb && rgb.B <= max_rgb;
+    }
+}
+
+static const float maxDelta = 5e-5f;
+
+// Find gamut intersection using specified bounds
+static inline ICh
+desat_bounded(float I, float h, float Cmin, float Cmax, Gamut gamut)
+{
+    if (I <= gamut.Imin)
+        return (ICh) { .I = gamut.Imin, .C = 0, .h = h };
+    else if (I >= gamut.Imax)
+        return (ICh) { .I = gamut.Imax, .C = 0, .h = h };
+    else {
+        const float maxDI = I * maxDelta;
+        ICh res = { .I = I, .C = (Cmin + Cmax) / 2, .h = h };
+        do {
+            if (ingamut(ich2ipt(res), gamut)) {
+                Cmin = res.C;
+            } else {
+                Cmax = res.C;
+            }
+            res.C = (Cmin + Cmax) / 2;
+        } while (Cmax - Cmin > maxDI);
+
+        return res;
+    }
+}
+
+// Finds maximally saturated in-gamut color (for given hue)
+static inline ICh saturate(float hue, Gamut gamut)
+{
+    static const float invphi = 0.6180339887498948f;
+    static const float invphi2 = 0.38196601125010515f;
+
+    ICh lo = { .I = gamut.Imin, .h = hue };
+    ICh hi = { .I = gamut.Imax, .h = hue };
+    float de = hi.I - lo.I;
+    ICh a = { .I = lo.I + invphi2 * de };
+    ICh b = { .I = lo.I + invphi  * de };
+    a = desat_bounded(a.I, hue, 0.0f, 0.5f, gamut);
+    b = desat_bounded(b.I, hue, 0.0f, 0.5f, gamut);
+
+    while (de > maxDelta) {
+        de *= invphi;
+        if (a.C > b.C) {
+            hi = b;
+            b = a;
+            a.I = lo.I + invphi2 * de;
+            a = desat_bounded(a.I, hue, lo.C - maxDelta, 0.5f, gamut);
+        } else {
+            lo = a;
+            a = b;
+            b.I = lo.I + invphi * de;
+            b = desat_bounded(b.I, hue, hi.C - maxDelta, 0.5f, gamut);
+        }
+    }
+
+    return a.C > b.C ? a : b;
+}
+
+static float softclip(float value, float source, float target)
+{
+    const float j = SOFTCLIP_KNEE;
+    float peak, x, a, b, scale;
+    if (!target)
+        return 0.0f;
+
+    peak = source / target;
+    x = fminf(value / target, peak);
+    if (x <= j || peak <= 1.0)
+        return value;
+
+    /* Apply simple mobius function */
+    a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak);
+    b = (j*j - 2.0f * j * peak + peak) / fmaxf(1e-6f, peak - 1.0f);
+    scale = (b*b + 2.0f * b*j + j*j) / (b - a);
+
+    return scale * (x + a) / (x + b) * target;
+}
+
+/**
+ * Something like fmixf(base, c, x) but follows an exponential curve, note
+ * that this can be used to extend 'c' outwards for x > 1
+ */
+static inline ICh mix_exp(ICh c, float x, float gamma, float base)
+{
+    return (ICh) {
+        .I = base + (c.I - base) * powf(x, gamma),
+        .C = c.C * x,
+        .h = c.h,
+    };
+}
+
+/**
+ * Drop gamma for colors approaching black and achromatic to avoid numerical
+ * instabilities, and excessive brightness boosting of grain, while also
+ * strongly boosting gamma for values exceeding the target peak
+ */
+static inline float scale_gamma(float gamma, ICh ich, Gamut gamut)
+{
+    const float Imin = gamut.Imin;
+    const float Irel = fmaxf((ich.I - Imin) / (gamut.peak.I - Imin), 0.0f);
+    return gamma * powf(Irel, 3) * fminf(ich.C / gamut.peak.C, 1.0f);
+}
+
+/* Clip a color along the exponential curve given by `gamma` */
+static inline IPT clip_gamma(IPT ipt, float gamma, Gamut gamut)
+{
+    float lo = 0.0f, hi = 1.0f, x = 0.5f;
+    const float maxDI = fmaxf(ipt.I * maxDelta, 1e-7f);
+    ICh ich;
+
+    if (ipt.I <= gamut.Imin)
+        return (IPT) { .I = gamut.Imin };
+    if (ingamut(ipt, gamut))
+        return ipt;
+
+    ich = ipt2ich(ipt);
+    if (!gamma)
+        return ich2ipt(desat_bounded(ich.I, ich.h, 0.0f, ich.C, gamut));
+
+    gamma = scale_gamma(gamma, ich, gamut);
+    do {
+        ICh test = mix_exp(ich, x, gamma, gamut.peak.I);
+        if (ingamut(ich2ipt(test), gamut)) {
+            lo = x;
+        } else {
+            hi = x;
+        }
+        x = (lo + hi) / 2.0f;
+    } while (hi - lo > maxDI);
+
+    return ich2ipt(mix_exp(ich, x, gamma, gamut.peak.I));
+}
+
+typedef struct CmsCtx CmsCtx;
+struct CmsCtx {
+    /* Tone mapping parameters */
+    float Qa, Qb, Qc, Pa, Pb, src_knee, dst_knee; /* perceptual */
+    float I_scale, I_offset; /* linear methods */
+
+    /* Colorspace parameters */
+    Gamut src;
+    Gamut tmp; /* after tone mapping */
+    Gamut dst;
+    SwsMatrix3x3 adaptation; /* for absolute intent */
+
+    /* Invocation parameters */
+    SwsColorMap map;
+    float (*tone_map)(const CmsCtx *ctx, float I);
+    IPT (*adapt_colors)(const CmsCtx *ctx, IPT ipt);
+    v3u16_t *input;
+    v3u16_t *output;
+
+    /* Threading parameters */
+    int slice_size;
+    int size_input;
+    int size_output_I;
+    int size_output_PT;
+};
+
+/**
+ * Helper function to pick a knee point based on the * HDR10+ brightness
+ * metadata and scene brightness average matching.
+ *
+ * Inspired by SMPTE ST2094-10, with some modifications
+ */
+static void st2094_pick_knee(float src_max, float src_min, float src_avg,
+                             float dst_max, float dst_min,
+                             float *out_src_knee, float *out_dst_knee)
+{
+    const float min_knee = PERCEPTUAL_KNEE_MIN;
+    const float max_knee = PERCEPTUAL_KNEE_MAX;
+    const float def_knee = PERCEPTUAL_KNEE_DEF;
+    const float src_knee_min = fmixf(src_min, src_max, min_knee);
+    const float src_knee_max = fmixf(src_min, src_max, max_knee);
+    const float dst_knee_min = fmixf(dst_min, dst_max, min_knee);
+    const float dst_knee_max = fmixf(dst_min, dst_max, max_knee);
+    float src_knee, target, adapted, tuning, adaptation, dst_knee;
+
+    /* Choose source knee based on dynamic source scene brightness */
+    src_knee = src_avg ? src_avg : fmixf(src_min, src_max, def_knee);
+    src_knee = av_clipf(src_knee, src_knee_min, src_knee_max);
+
+    /* Choose target adaptation point based on linearly re-scaling source knee */
+    target = (src_knee - src_min) / (src_max - src_min);
+    adapted = fmixf(dst_min, dst_max, target);
+
+    /**
+     * Choose the destnation knee by picking the perceptual adaptation point
+     * between the source knee and the desired target. This moves the knee
+     * point, on the vertical axis, closer to the 1:1 (neutral) line.
+     *
+     * Adjust the adaptation strength towards 1 based on how close the knee
+     * point is to its extreme values (min/max knee)
+     */
+    tuning = smoothstepf(max_knee, def_knee, target) *
+             smoothstepf(min_knee, def_knee, target);
+    adaptation = fmixf(1.0f, PERCEPTUAL_ADAPTATION, tuning);
+    dst_knee = fmixf(src_knee, adapted, adaptation);
+    dst_knee = av_clipf(dst_knee, dst_knee_min, dst_knee_max);
+
+    *out_src_knee = src_knee;
+    *out_dst_knee = dst_knee;
+}
+
+static void tone_map_setup(CmsCtx *ctx, bool dynamic)
+{
+    const float dst_min = ctx->dst.Imin;
+    const float dst_max = ctx->dst.Imax;
+    const float src_min = ctx->src.Imin;
+    const float src_max = dynamic ? ctx->src.Imax_frame : ctx->src.Imax;
+    const float src_avg = dynamic ? ctx->src.Iavg_frame : 0.0f;
+    float slope, ratio, in_min, in_max, out_min, out_max, t;
+
+    switch (ctx->map.intent) {
+    case SWS_INTENT_PERCEPTUAL:
+        st2094_pick_knee(src_max, src_min, src_avg, dst_max, dst_min,
+                         &ctx->src_knee, &ctx->dst_knee);
+
+        /* Solve for linear knee (Pa = 0) */
+        slope = (ctx->dst_knee - dst_min) / (ctx->src_knee - src_min);
+
+        /**
+         * Tune the slope at the knee point slightly: raise it to a user-provided
+         * gamma exponent, multiplied by an extra tuning coefficient designed to
+         * make the slope closer to 1.0 when the difference in peaks is low, and
+         * closer to linear when the difference between peaks is high.
+         */
+        ratio = src_max / dst_max - 1.0f;
+        ratio = av_clipf(SLOPE_TUNING * ratio, SLOPE_OFFSET, 1.0f + SLOPE_OFFSET);
+        slope = powf(slope, (1.0f - PERCEPTUAL_CONTRAST) * ratio);
+
+        /* Normalize everything the pivot to make the math easier */
+        in_min  = src_min - ctx->src_knee;
+        in_max  = src_max - ctx->src_knee;
+        out_min = dst_min - ctx->dst_knee;
+        out_max = dst_max - ctx->dst_knee;
+
+        /**
+         * Solve P of order 2 for:
+         *  P(in_min) = out_min
+         *  P'(0.0) = slope
+         *  P(0.0) = 0.0
+         */
+        ctx->Pa = (out_min - slope * in_min) / (in_min * in_min);
+        ctx->Pb = slope;
+
+        /**
+         * Solve Q of order 3 for:
+         *  Q(in_max) = out_max
+         *  Q''(in_max) = 0.0
+         *  Q(0.0) = 0.0
+         *  Q'(0.0) = slope
+         */
+        t = 2 * in_max * in_max;
+        ctx->Qa = (slope * in_max - out_max) / (in_max * t);
+        ctx->Qb = -3 * (slope * in_max - out_max) / t;
+        ctx->Qc = slope;
+        break;
+    case SWS_INTENT_SATURATION:
+        /* Linear stretch */
+        ctx->I_scale = (dst_max - dst_min) / (src_max - src_min);
+        ctx->I_offset = dst_min - src_min * ctx->I_scale;
+        break;
+    case SWS_INTENT_RELATIVE_COLORIMETRIC:
+        /* Pure black point adaptation */
+        ctx->I_scale = src_max / (src_max - src_min) /
+                      (dst_max / (dst_max - dst_min));
+        ctx->I_offset = dst_min - src_min * ctx->I_scale;
+        break;
+    case SWS_INTENT_ABSOLUTE_COLORIMETRIC:
+        /* Hard clip */
+        ctx->I_scale = 1.0f;
+        ctx->I_offset = 0.0f;
+        break;
+    }
+}
+
+static av_always_inline IPT tone_map_apply(const CmsCtx *ctx, IPT ipt)
+{
+    float I = ipt.I, desat;
+
+    if (ctx->map.intent == SWS_INTENT_PERCEPTUAL) {
+        const float Pa = ctx->Pa, Pb = ctx->Pb;
+        const float Qa = ctx->Qa, Qb = ctx->Qb, Qc = ctx->Qc;
+        I -= ctx->src_knee;
+        I = I > 0 ? ((Qa * I + Qb) * I + Qc) * I : (Pa * I + Pb) * I;
+        I += ctx->dst_knee;
+    } else {
+        I = ctx->I_scale * I + ctx->I_offset;
+    }
+
+    /**
+     * Avoids raising saturation excessively when raising brightness, and
+     * also desaturates when reducing brightness greatly to account for the
+     * reduction in gamut volume.
+     */
+    desat = fminf(ipt.I / I, hull(I) / hull(ipt.I));
+    return (IPT) {
+        .I = I,
+        .P = ipt.P * desat,
+        .T = ipt.T * desat,
+    };
+}
+
+static IPT perceptual(const CmsCtx *ctx, IPT ipt)
+{
+    ICh ich = ipt2ich(ipt);
+    IPT mapped = rgb2ipt(ipt2rgb(ipt, ctx->tmp.lms2content), ctx->dst.content2lms);
+    RGB rgb;
+    float maxRGB;
+
+    /* Protect in gamut region */
+    const float maxC = fmaxf(ctx->tmp.peak.C, ctx->dst.peak.C);
+    float k = smoothstepf(PERCEPTUAL_DEADZONE, 1.0f, ich.C / maxC);
+    k *= PERCEPTUAL_STRENGTH;
+    ipt.I = fmixf(ipt.I, mapped.I, k);
+    ipt.P = fmixf(ipt.P, mapped.P, k);
+    ipt.T = fmixf(ipt.T, mapped.T, k);
+
+    rgb = ipt2rgb(ipt, ctx->dst.lms2content);
+    maxRGB = fmaxf(rgb.R, fmaxf(rgb.G, rgb.B));
+    rgb.R = fmaxf(softclip(rgb.R, maxRGB, ctx->dst.Lw), ctx->dst.Lb);
+    rgb.G = fmaxf(softclip(rgb.G, maxRGB, ctx->dst.Lw), ctx->dst.Lb);
+    rgb.B = fmaxf(softclip(rgb.B, maxRGB, ctx->dst.Lw), ctx->dst.Lb);
+
+    return rgb2ipt(rgb, ctx->dst.content2lms);
+}
+
+static IPT relative(const CmsCtx *ctx, IPT ipt)
+{
+    return clip_gamma(ipt, COLORIMETRIC_GAMMA, ctx->dst);
+}
+
+static IPT absolute(const CmsCtx *ctx, IPT ipt)
+{
+    RGB rgb = ipt2rgb(ipt, ctx->dst.lms2encoding);
+    float c[3] = { rgb.R, rgb.G, rgb.B };
+    ff_sws_matrix3x3_apply(&ctx->adaptation, c);
+    ipt = rgb2ipt((RGB) { c[0], c[1], c[2] }, ctx->dst.encoding2lms);
+
+    return clip_gamma(ipt, COLORIMETRIC_GAMMA, ctx->dst);
+}
+
+static IPT saturation(const CmsCtx * ctx, IPT ipt)
+{
+    RGB rgb = ipt2rgb(ipt, ctx->tmp.lms2content);
+    return rgb2ipt(rgb, ctx->dst.content2lms);
+}
+
+static av_always_inline av_const uint16_t av_round16f(float x)
+{
+    return av_clip_uint16(x * (UINT16_MAX - 1) + 0.5f);
+}
+
+/* Call this whenever the hue changes inside the loop body */
+static av_always_inline void update_hue_peaks(CmsCtx *ctx, float P, float T)
+{
+    const float hue = atan2f(T, P);
+    switch (ctx->map.intent) {
+    case SWS_INTENT_PERCEPTUAL:
+        ctx->tmp.peak = saturate(hue, ctx->tmp);
+        /* fall through */
+    case SWS_INTENT_RELATIVE_COLORIMETRIC:
+    case SWS_INTENT_ABSOLUTE_COLORIMETRIC:
+        ctx->dst.peak = saturate(hue, ctx->dst);
+        return;
+    default:
+        return;
+    }
+}
+
+static void generate_slice(void *priv, int jobnr, int threadnr, int nb_jobs,
+                           int nb_threads)
+{
+    CmsCtx ctx = *(const CmsCtx *) priv;
+
+    const int slice_start  = jobnr * ctx.slice_size;
+    const int slice_stride = ctx.size_input * ctx.size_input;
+    const int slice_end    = FFMIN((jobnr + 1) * ctx.slice_size, ctx.size_input);
+    v3u16_t *input = &ctx.input[slice_start * slice_stride];
+
+    const int output_slice_h = (ctx.size_output_PT + nb_jobs - 1) / nb_jobs;
+    const int output_start   = jobnr * output_slice_h;
+    const int output_stride  = ctx.size_output_PT * ctx.size_output_I;
+    const int output_end     = FFMIN((jobnr + 1) * output_slice_h, ctx.size_output_PT);
+    v3u16_t *output = ctx.output ? &ctx.output[output_start * output_stride] : NULL;
+
+    const float I_scale   = 1.0f / (ctx.src.Imax - ctx.src.Imin);
+    const float I_offset  = -ctx.src.Imin * I_scale;
+    const float PT_offset = (float) (1 << 15) / (UINT16_MAX - 1);
+
+    const float input_scale     = 1.0f / (ctx.size_input - 1);
+    const float output_scale_PT = 1.0f / (ctx.size_output_PT - 1);
+    const float output_scale_I  = (ctx.tmp.Imax - ctx.tmp.Imin) /
+                                  (ctx.size_output_I - 1);
+
+    for (int Bx = slice_start; Bx < slice_end; Bx++) {
+        const float B = input_scale * Bx;
+        for (int Gx = 0; Gx < ctx.size_input; Gx++) {
+            const float G = input_scale * Gx;
+            for (int Rx = 0; Rx < ctx.size_input; Rx++) {
+                double c[3] = { input_scale * Rx, G, B };
+                RGB rgb;
+                IPT ipt;
+
+                ctx.src.eotf(ctx.src.Lw, ctx.src.Lb, c);
+                rgb = (RGB) { c[0], c[1], c[2] };
+                ipt = rgb2ipt(rgb, ctx.src.encoding2lms);
+
+                if (output) {
+                    /* Save intermediate value to 3DLUT */
+                    *input++ = (v3u16_t) {
+                        av_round16f(I_scale * ipt.I + I_offset),
+                        av_round16f(ipt.P + PT_offset),
+                        av_round16f(ipt.T + PT_offset),
+                    };
+                } else {
+                    update_hue_peaks(&ctx, ipt.P, ipt.T);
+
+                    ipt = tone_map_apply(&ctx, ipt);
+                    ipt = ctx.adapt_colors(&ctx, ipt);
+                    rgb = ipt2rgb(ipt, ctx.dst.lms2encoding);
+
+                    c[0] = rgb.R;
+                    c[1] = rgb.G;
+                    c[2] = rgb.B;
+                    ctx.dst.eotf_inv(ctx.dst.Lw, ctx.dst.Lb, c);
+                    *input++ = (v3u16_t) {
+                        av_round16f(c[0]),
+                        av_round16f(c[1]),
+                        av_round16f(c[2]),
+                    };
+                }
+            }
+        }
+    }
+
+    if (!output)
+        return;
+
+    /* Generate split gamut mapping LUT */
+    for (int Tx = output_start; Tx < output_end; Tx++) {
+        const float T = output_scale_PT * Tx - PT_offset;
+        for (int Px = 0; Px < ctx.size_output_PT; Px++) {
+            const float P = output_scale_PT * Px - PT_offset;
+            update_hue_peaks(&ctx, P, T);
+
+            for (int Ix = 0; Ix < ctx.size_output_I; Ix++) {
+                const float I = output_scale_I * Ix + ctx.tmp.Imin;
+                IPT ipt = ctx.adapt_colors(&ctx, (IPT) { I, P, T });
+                RGB rgb = ipt2rgb(ipt, ctx.dst.lms2encoding);
+                double c[3] = { rgb.R, rgb.G, rgb.B };
+                ctx.dst.eotf_inv(ctx.dst.Lw, ctx.dst.Lb, c);
+                *output++ = (v3u16_t) {
+                    av_round16f(c[0]),
+                    av_round16f(c[1]),
+                    av_round16f(c[2]),
+                };
+            }
+        }
+    }
+}
+
+int sws_color_map_generate_static(v3u16_t *lut, int size, const SwsColorMap *map)
+{
+    return sws_color_map_generate_dynamic(lut, NULL, size, 1, 1, map);
+}
+
+int sws_color_map_generate_dynamic(v3u16_t *input, v3u16_t *output,
+                                   int size_input, int size_I, int size_PT,
+                                   const SwsColorMap *map)
+{
+    AVSliceThread *slicethread;
+    int ret, num_slices;
+
+    CmsCtx ctx = {
+        .map            = *map,
+        .input          = input,
+        .output         = output,
+        .size_input     = size_input,
+        .size_output_I  = size_I,
+        .size_output_PT = size_PT,
+        .src            = gamut_from_colorspace(map->src),
+        .dst            = gamut_from_colorspace(map->dst),
+    };
+
+    switch (ctx.map.intent) {
+    case SWS_INTENT_PERCEPTUAL:            ctx.adapt_colors = perceptual; break;
+    case SWS_INTENT_RELATIVE_COLORIMETRIC: ctx.adapt_colors = relative;   break;
+    case SWS_INTENT_SATURATION:            ctx.adapt_colors = saturation; break;
+    case SWS_INTENT_ABSOLUTE_COLORIMETRIC: ctx.adapt_colors = absolute;   break;
+    default: return AVERROR(EINVAL);
+    }
+
+    if (!output) {
+        /* Tone mapping is handled in a separate step when using dynamic TM */
+        tone_map_setup(&ctx, false);
+    }
+
+    /* Intermediate color space after tone mapping */
+    ctx.tmp      = ctx.src;
+    ctx.tmp.Lb   = ctx.dst.Lb;
+    ctx.tmp.Lw   = ctx.dst.Lw;
+    ctx.tmp.Imin = ctx.dst.Imin;
+    ctx.tmp.Imax = ctx.dst.Imax;
+
+    if (ctx.map.intent == SWS_INTENT_ABSOLUTE_COLORIMETRIC) {
+        /**
+         * The IPT transform already implies an explicit white point adaptation
+         * from src to dst, so to get absolute colorimetric semantics we have
+         * to explicitly undo this adaptation with a * corresponding inverse.
+         */
+        ctx.adaptation = ff_sws_get_adaptation(&ctx.map.dst.gamut,
+                                               ctx.dst.wp, ctx.src.wp);
+    }
+
+    ret = avpriv_slicethread_create(&slicethread, &ctx, generate_slice, NULL, 0);
+    if (ret < 0)
+        return ret;
+
+    ctx.slice_size = (ctx.size_input + ret - 1) / ret;
+    num_slices = (ctx.size_input + ctx.slice_size - 1) / ctx.slice_size;
+    avpriv_slicethread_execute(slicethread, num_slices, 0);
+    avpriv_slicethread_free(&slicethread);
+    return 0;
+}
+
+void sws_tone_map_generate(v2u16_t *lut, int size, const SwsColorMap *map)
+{
+    CmsCtx ctx = {
+        .map = *map,
+        .src = gamut_from_colorspace(map->src),
+        .dst = gamut_from_colorspace(map->dst),
+    };
+
+    const float src_scale  = (ctx.src.Imax - ctx.src.Imin) / (size - 1);
+    const float src_offset = ctx.src.Imin;
+    const float dst_scale  = 1.0f / (ctx.dst.Imax - ctx.dst.Imin);
+    const float dst_offset = -ctx.dst.Imin * dst_scale;
+
+    tone_map_setup(&ctx, true);
+
+    for (int i = 0; i < size; i++) {
+        const float I = src_scale * i + src_offset;
+        IPT ipt = tone_map_apply(&ctx, (IPT) { I, 1.0f });
+        lut[i] = (v2u16_t) {
+            av_round16f(dst_scale * ipt.I + dst_offset),
+            av_clip_uint16(ipt.P * (1 << 15) + 0.5f),
+        };
+    }
+}
diff --git a/libswscale/cms.h b/libswscale/cms.h
new file mode 100644
index 0000000000..542b25e92e
--- /dev/null
+++ b/libswscale/cms.h
@@ -0,0 +1,105 @@
+ /*
+  * Copyright (C) 2024 Niklas Haas
+  *
+  * This file is part of FFmpeg.
+  *
+  * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
+  * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+
+#ifndef SWSCALE_CMS_H
+#define SWSCALE_CMS_H
+
+#include <stdbool.h>
+
+#include "libavutil/csp.h"
+
+#include "csputils.h"
+#include "swscale.h"
+#include "utils.h"
+
+/* Minimum, maximum, and default knee point for perceptual tone mapping [0,1] */
+#define PERCEPTUAL_KNEE_MIN 0.10f
+#define PERCEPTUAL_KNEE_MAX 0.80f
+#define PERCEPTUAL_KNEE_DEF 0.40f
+
+/* Ratio between source average and target average. */
+#define PERCEPTUAL_ADAPTATION 0.40f
+
+/* (Relative) chromaticity protection zone for perceptual mapping [0,1] */
+#define PERCEPTUAL_DEADZONE 0.30f
+
+/* Contrast setting for perceptual tone mapping. [0,1.5] */
+#define PERCEPTUAL_CONTRAST 0.50f
+
+/* Tuning constants for overriding the contrast near extremes */
+#define SLOPE_TUNING 1.50f /* [0,10] */
+#define SLOPE_OFFSET 0.20f /* [0,1] */
+
+/* Strength of the perceptual saturation mapping component [0,1] */
+#define PERCEPTUAL_STRENGTH 0.80f
+
+/* Knee point to use for perceptual soft clipping [0,1] */
+#define SOFTCLIP_KNEE 0.70f
+
+/* I vs C curve gamma to use for colorimetric clipping [0,10] */
+#define COLORIMETRIC_GAMMA 1.80f
+
+/* Struct describing a color mapping operation */
+typedef struct SwsColorMap {
+    SwsColor src;
+    SwsColor dst;
+    SwsIntent intent;
+} SwsColorMap;
+
+/**
+ * Returns true if the given color map is a semantic no-op - that is,
+ * the overall RGB end to end transform would an identity mapping.
+ */
+bool sws_color_map_noop(const SwsColorMap *map);
+
+/**
+ * Generates a single end-to-end color mapping 3DLUT embedding a static tone
+ * mapping curve.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int sws_color_map_generate_static(v3u16_t *lut, int size, const SwsColorMap *map);
+
+/**
+ * Generates a split pair of 3DLUTS, going to IPT and back, allowing an
+ * arbitrary dynamic EETF to be nestled in between these two operations.
+ *
+ * See sws_tone_map_generate().
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int sws_color_map_generate_dynamic(v3u16_t *input, v3u16_t *output,
+                                   int size_input, int size_I, int size_PT,
+                                   const SwsColorMap *map);
+
+/**
+ * Generate a 1D LUT of size `size` adapting intensity (I) levels from the
+ * source to the destination color space. The LUT is normalized to the
+ * relevant intensity range directly. The second channel of each entry returns
+ * the corresponding 15-bit scaling factor for the P/T channels. The scaling
+ * factor k may be applied as `(1 << 15) - k + (PT * k >> 15)`.
+ *
+ * This is designed to be used with sws_gamut_map_generate_dynamic().
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+void sws_tone_map_generate(v2u16_t *lut, int size, const SwsColorMap *map);
+
+#endif // SWSCALE_GAMUT_MAPPING_H
-- 
2.47.0