[FFmpeg-cvslog] lavu/tx: refactor to explicitly track and convert lookup table order

Lynne git at videolan.org
Thu Nov 24 16:59:46 EET 2022


ffmpeg | branch: master | Lynne <dev at lynne.ee> | Sat Nov 19 00:47:45 2022 +0100| [87bae6b0189d5cb71b836890078f96a4d1abd277] | committer: Lynne

lavu/tx: refactor to explicitly track and convert lookup table order

Necessary for generalizing PFAs.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=87bae6b0189d5cb71b836890078f96a4d1abd277
---

 libavutil/aarch64/tx_float_init.c |   5 +-
 libavutil/tx.c                    | 109 +++++++++++++++++++++++++++++---------
 libavutil/tx_priv.h               |  52 ++++++++++++++++--
 libavutil/tx_template.c           |  49 ++++++++---------
 libavutil/x86/tx_float_init.c     |  46 ++++++++--------
 5 files changed, 181 insertions(+), 80 deletions(-)

diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c
index e7b73b4bf9..8300472c4c 100644
--- a/libavutil/aarch64/tx_float_init.c
+++ b/libavutil/aarch64/tx_float_init.c
@@ -37,12 +37,11 @@ static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd,
                              uint64_t flags, FFTXCodeletOptions *opts,
                              int len, int inv, const void *scale)
 {
-    const int inv_lookup = opts ? opts->invert_lookup : 1;
     ff_tx_init_tabs_float(len);
     if (cd->max_len == 2)
-        return ff_tx_gen_ptwo_revtab(s, inv_lookup);
+        return ff_tx_gen_ptwo_revtab(s, opts);
     else
-        return ff_tx_gen_split_radix_parity_revtab(s, len, inv, inv_lookup, 8, 0);
+        return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0);
 }
 
 const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = {
diff --git a/libavutil/tx.c b/libavutil/tx.c
index ff81d235ba..8027e983ba 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -39,11 +39,41 @@ static av_always_inline int mulinv(int n, int m)
     return 0;
 }
 
+int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts,
+                            int d1, int d2)
+{
+    const int sl = d1*d2;
+
+    s->map = av_malloc(s->len*sizeof(*s->map));
+    if (!s->map)
+        return AVERROR(ENOMEM);
+
+    for (int k = 0; k < s->len; k += sl) {
+        if (s->inv || (opts && opts->map_dir == FF_TX_MAP_SCATTER)) {
+            for (int m = 0; m < d2; m++)
+                for (int n = 0; n < d1; n++)
+                    s->map[k + ((m*d1 + n*d2) % (sl))] = m*d1 + n;
+        } else {
+            for (int m = 0; m < d2; m++)
+                for (int n = 0; n < d1; n++)
+                    s->map[k + m*d1 + n] = (m*d1 + n*d2) % (sl);
+        }
+
+        if (s->inv)
+            for (int w = 1; w <= ((sl) >> 1); w++)
+                FFSWAP(int, s->map[k + w], s->map[k + sl - w]);
+    }
+
+    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
+
+    return 0;
+}
+
 /* Guaranteed to work for any n, m where gcd(n, m) == 1 */
-int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
+int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts,
+                               int inv, int n, int m)
 {
     int *in_map, *out_map;
-    const int inv = s->inv;
     const int len = n*m;    /* Will not be equal to s->len for MDCTs */
     int m_inv, n_inv;
 
@@ -61,14 +91,22 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
     out_map = s->map + len;
 
     /* Ruritanian map for input, CRT map for output, can be swapped */
-    for (int j = 0; j < m; j++) {
-        for (int i = 0; i < n; i++) {
-            in_map[j*n + i] = (i*m + j*n) % len;
-            out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
+    if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
+        for (int j = 0; j < m; j++) {
+            for (int i = 0; i < n; i++) {
+                in_map[(i*m + j*n) % len] = j*n + i;
+                out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
+            }
+        }
+    } else {
+        for (int j = 0; j < m; j++) {
+            for (int i = 0; i < n; i++) {
+                in_map[j*n + i] = (i*m + j*n) % len;
+                out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
+            }
         }
     }
 
-    /* Change transform direction by reversing all ACs */
     if (inv) {
         for (int i = 0; i < m; i++) {
             int *in = &in_map[i*n + 1]; /* Skip the DC */
@@ -77,17 +115,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
         }
     }
 
-    /* Our 15-point transform is also a compound one, so embed its input map */
-    if (n == 15) {
-        for (int k = 0; k < m; k++) {
-            int tmp[15];
-            memcpy(tmp, &in_map[k*15], 15*sizeof(*tmp));
-            for (int i = 0; i < 5; i++) {
-                for (int j = 0; j < 3; j++)
-                    in_map[k*15 + i*3 + j] = tmp[(i*3 + j*5) % 15];
-            }
-        }
-    }
+    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
 
     return 0;
 }
@@ -103,21 +131,23 @@ static inline int split_radix_permutation(int i, int len, int inv)
     return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv);
 }
 
-int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup)
+int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
 {
     int len = s->len;
 
     if (!(s->map = av_malloc(len*sizeof(*s->map))))
         return AVERROR(ENOMEM);
 
-    if (invert_lookup) {
+    if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
         for (int i = 0; i < s->len; i++)
-            s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
+            s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
     } else {
         for (int i = 0; i < s->len; i++)
-            s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
+            s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
     }
 
+    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
+
     return 0;
 }
 
@@ -207,7 +237,8 @@ static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
 }
 
 int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
-                                        int inv_lookup, int basis, int dual_stride)
+                                        FFTXCodeletOptions *opts,
+                                        int basis, int dual_stride)
 {
     basis >>= 1;
     if (len < basis)
@@ -220,7 +251,10 @@ int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
     av_assert0(dual_stride <= basis);
 
     parity_revtab_generator(s->map, len, inv, 0, 0, 0, len,
-                            basis, dual_stride, inv_lookup != 0);
+                            basis, dual_stride,
+                            opts ? opts->map_dir == FF_TX_MAP_GATHER : FF_TX_MAP_GATHER);
+
+    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
 
     return 0;
 }
@@ -656,6 +690,33 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
             ret = cd->init(sctx, cd, flags, opts, len, inv, scale);
 
         if (ret >= 0) {
+            if (opts && opts->map_dir != FF_TX_MAP_NONE &&
+                sctx->map_dir == FF_TX_MAP_NONE) {
+                /* If a specific map direction was requested and the
+                 * subtransform has no map, create an identity one. */
+                sctx->map = av_malloc(len*sizeof(*sctx->map));
+                if (!sctx->map) {
+                    ret = AVERROR(ENOMEM);
+                    goto end;
+                }
+
+                for (int i = 0; i < len; i++)
+                    sctx->map[i] = i;
+            } else if (opts && (opts->map_dir != sctx->map_dir)) {
+                int *tmp = av_malloc(len*sizeof(*sctx->map));
+                if (!tmp) {
+                    ret = AVERROR(ENOMEM);
+                    goto end;
+                }
+
+                memcpy(tmp, sctx->map, len*sizeof(*sctx->map));
+
+                for (int i = 0; i < len; i++)
+                    sctx->map[tmp[i]] = i;
+
+                free(tmp);
+            }
+
             s->nb_sub++;
             goto end;
         }
diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h
index 80d045f6af..207f79dfb8 100644
--- a/libavutil/tx_priv.h
+++ b/libavutil/tx_priv.h
@@ -158,10 +158,23 @@ typedef enum FFTXCodeletPriority {
     FF_TX_PRIO_MAX          =  32768,  /* For custom implementations/ASICs */
 } FFTXCodeletPriority;
 
+typedef enum FFTXMapDirection {
+    /* No map. Make a map up. */
+    FF_TX_MAP_NONE = 0,
+
+    /* Lookup table must be applied via dst[i] = src[lut[i]]; */
+    FF_TX_MAP_GATHER,
+
+    /* Lookup table must be applied via dst[lut[i]] = src[i]; */
+    FF_TX_MAP_SCATTER,
+} FFTXMapDirection;
+
 /* Codelet options */
 typedef struct FFTXCodeletOptions {
-    int invert_lookup;     /* If codelet is flagged as FF_TX_CODELET_PRESHUFFLE,
-                              invert the lookup direction for the map generated */
+    /* Request a specific lookup table direction. Codelets MUST put the
+     * direction in AVTXContext. If the codelet does not respect this, a
+     * conversion will be performed. */
+    FFTXMapDirection map_dir;
 } FFTXCodeletOptions;
 
 /* Maximum number of factors a codelet may have. Arbitrary. */
@@ -234,11 +247,32 @@ struct AVTXContext {
     enum AVTXType      type;            /* Type of transform */
     uint64_t           flags;           /* A combination of AVTXFlags and
                                          * codelet flags used when creating */
+    FFTXMapDirection   map_dir;         /* Direction of AVTXContext->map */
     float              scale_f;
     double             scale_d;
     void              *opaque;          /* Free to use by implementations */
 };
 
+/* This macro embeds a Ruritanian PFA input map into an existing lookup table
+ * to avoid double permutation. This allows compound factors to be
+ * synthesized as fast PFA FFTs and embedded into either other transforms
+ * or standalone ones.
+ * The output CRT map must still be pre-baked into the transform. */
+#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)                             \
+    do {                                                                         \
+        int mtmp[(d1)*(d2)];                                                     \
+        for (int k = 0; k < tot_len; k += (d1)*(d2)) {                           \
+            memcpy(mtmp, &map[k], (d1)*(d2)*sizeof(*mtmp));                      \
+            for (int m = 0; m < (d2); m++)                                       \
+                for (int n = 0; n < (d1); n++)                                   \
+                    map[k + m*(d1) + n] = mtmp[(m*(d1) + n*(d2)) % ((d1)*(d2))]; \
+        }                                                                        \
+    } while (0)
+
+/* This function generates a Ruritanian PFA input map into s->map. */
+int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts,
+                            int d1, int d2);
+
 /* Create a subtransform in the current context with the given parameters.
  * The flags parameter from FFTXCodelet.init() should be preserved as much
  * as that's possible.
@@ -250,11 +284,18 @@ int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
 /* Clear the context by freeing all tables, maps and subtransforms. */
 void ff_tx_clear_ctx(AVTXContext *s);
 
+/* Generate a default map (0->len or 0, (len-1)->1 for inverse transforms)
+ * for a context. */
+int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts);
+
 /*
  * Generates the PFA permutation table into AVTXContext->pfatab. The end table
  * is appended to the start table.
+ * The `inv` flag should only be enabled if the lookup tables of subtransforms
+ * won't get flattened.
  */
-int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m);
+int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts,
+                               int inv, int n, int m);
 
 /*
  * Generates a standard-ish (slightly modified) Split-Radix revtab into
@@ -262,7 +303,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m);
  * If it's set to 0, it has to be applied like out[map[i]] = in[i], otherwise
  * if it's set to 1, has to be applied as out[i] = in[map[i]]
  */
-int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup);
+int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts);
 
 /*
  * Generates an index into AVTXContext->inplace_idx that if followed in the
@@ -303,7 +344,8 @@ int ff_tx_gen_inplace_map(AVTXContext *s, int len);
  * to out[i] = src[map[i]].
  */
 int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
-                                        int inv_lookup, int basis, int dual_stride);
+                                        FFTXCodeletOptions *opts,
+                                        int basis, int dual_stride);
 
 /* Typed init function to initialize shared tables. Will initialize all tables
  * for all factors of a length. */
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index c157719d73..38ab517f66 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -479,30 +479,15 @@ static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s,
                                                   int len, int inv,
                                                   const void *scale)
 {
+    int ret = 0;
     TX_TAB(ff_tx_init_tabs)(len);
 
-    if (flags & FF_TX_PRESHUFFLE) {
-        s->map = av_malloc(len*sizeof(s->map));
-        s->map[0] = 0; /* DC is always at the start */
-        if (inv) /* Reversing the ACs flips the transform direction */
-            for (int i = 1; i < len; i++)
-                s->map[i] = len - i;
-        else
-            for (int i = 1; i < len; i++)
-                s->map[i] = i;
-    }
-
-    /* Our 15-point transform is actually a 5x3 PFA, so embed its input map. */
-    if (len == 15) {
-        int tmp[15];
-        memcpy(tmp, s->map, 15*sizeof(*tmp));
-        for (int i = 0; i < 5; i++) {
-            for (int j = 0; j < 3; j++)
-                s->map[i*3 + j] = tmp[(i*3 + j*5) % 15];
-        }
-    }
+    if (len == 15)
+        ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
+    else if (flags & FF_TX_PRESHUFFLE)
+        ret = ff_tx_gen_default_map(s, opts);
 
-    return 0;
+    return ret;
 }
 
 #define DECL_FACTOR_S(n)                                                       \
@@ -605,7 +590,7 @@ static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s,
                                                       const void *scale)
 {
     TX_TAB(ff_tx_init_tabs)(len);
-    return ff_tx_gen_ptwo_revtab(s, opts ? opts->invert_lookup : 1);
+    return ff_tx_gen_ptwo_revtab(s, opts);
 }
 
 #define DECL_SR_CODELET_DEF(n)                              \
@@ -742,7 +727,9 @@ static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s,
 {
     int ret;
     int is_inplace = !!(flags & AV_TX_INPLACE);
-    FFTXCodeletOptions sub_opts = { .invert_lookup = !is_inplace };
+    FFTXCodeletOptions sub_opts = {
+        .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
+    };
 
     flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
     flags |=  AV_TX_INPLACE;      /* in-place */
@@ -974,7 +961,9 @@ static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
                                 sub_len, inv, scale)))
         return ret;
 
-    if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
+    /* Generate PFA map */
+    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
+                                          cd->factors[0], sub_len)))
         return ret;
 
     if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
@@ -1128,7 +1117,9 @@ static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
                                             const void *scale)
 {
     int ret;
-    FFTXCodeletOptions sub_opts = { .invert_lookup = inv };
+    FFTXCodeletOptions sub_opts = {
+        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
+    };
 
     s->scale_d = *((SCALE_TYPE *)scale);
     s->scale_f = s->scale_d;
@@ -1328,7 +1319,7 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
                                                 const void *scale)
 {
     int ret, sub_len;
-    FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
+    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
 
     len >>= 1;
     sub_len = len / cd->factors[0];
@@ -1344,9 +1335,13 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
                                 sub_len, inv, scale)))
         return ret;
 
-    if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
+    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
         return ret;
 
+    /* Our 15-point transform is also a compound one, so embed its input map */
+    if (cd->factors[0] == 15)
+        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
+
     if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
         return ret;
 
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index 97ee44defa..d3c0beb50f 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -75,12 +75,11 @@ static av_cold int b ##basis## _i ##interleave(AVTXContext *s,                 \
                                                int len, int inv,               \
                                                const void *scale)              \
 {                                                                              \
-    const int inv_lookup = opts ? opts->invert_lookup : 1;                     \
     ff_tx_init_tabs_float(len);                                                \
     if (cd->max_len == 2)                                                      \
-        return ff_tx_gen_ptwo_revtab(s, inv_lookup);                           \
+        return ff_tx_gen_ptwo_revtab(s, opts);                                 \
     else                                                                       \
-        return ff_tx_gen_split_radix_parity_revtab(s, len, inv, inv_lookup,    \
+        return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts,          \
                                                    basis, interleave);         \
 }
 
@@ -91,27 +90,27 @@ static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
                                uint64_t flags, FFTXCodeletOptions *opts,
                                int len, int inv, const void *scale)
 {
+    int ret;
+
+    /* The transformations below are performed in the gather domain,
+     * so override the option and let the infrastructure convert the map
+     * to SCATTER if needed. */
+    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
+
     TX_TAB(ff_tx_init_tabs)(len);
 
-    s->map = av_malloc(len*sizeof(s->map));
-    s->map[0] = 0; /* DC is always at the start */
-    if (inv) /* Reversing the ACs flips the transform direction */
-        for (int i = 1; i < len; i++)
-            s->map[i] = len - i;
+    if (len == 15)
+        ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5);
     else
-        for (int i = 1; i < len; i++)
-            s->map[i] = i;
+        ret = ff_tx_gen_default_map(s, &sub_opts);
+
+    if (ret < 0)
+        return ret;
 
     if (len == 15) {
         int cnt = 0, tmp[15];
 
-        /* Our 15-point transform is actually a 5x3 PFA, so embed its input map. */
-        memcpy(tmp, s->map, 15*sizeof(*tmp));
-        for (int i = 0; i < 5; i++)
-            for (int j = 0; j < 3; j++)
-                s->map[i*3 + j] = tmp[(i*3 + j*5) % 15];
-
-        /* Special 15-point assembly permutation */
+        /* Special permutation to simplify loads in the pre-permuted version */
         memcpy(tmp, s->map, 15*sizeof(*tmp));
         for (int i = 1; i < 15; i += 3) {
             s->map[cnt] = tmp[i];
@@ -139,7 +138,7 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
                               int len, int inv, const void *scale)
 {
     int ret;
-    FFTXCodeletOptions sub_opts = { .invert_lookup = 1 };
+    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
 
     s->scale_d = *((SCALE_TYPE *)scale);
     s->scale_f = s->scale_d;
@@ -177,7 +176,7 @@ static av_cold int fft_pfa_init(AVTXContext *s,
 {
     int ret;
     int sub_len = len / cd->factors[0];
-    FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
+    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
 
     flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
     flags |=  AV_TX_INPLACE;      /* in-place */
@@ -188,13 +187,18 @@ static av_cold int fft_pfa_init(AVTXContext *s,
                                 sub_len, inv, scale)))
         return ret;
 
-    if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
+    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
         return ret;
 
     if (cd->factors[0] == 15) {
+        int tmp[15];
+
+        /* Our 15-point transform is also a compound one, so embed its input map */
+        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
+
+        /* Special permutation to simplify loads in the pre-permuted version */
         for (int k = 0; k < s->sub[0].len; k++) {
             int cnt = 0;
-            int tmp[15];
             memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
             for (int i = 1; i < 15; i += 3) {
                 s->map[k*15 + cnt] = tmp[i];



More information about the ffmpeg-cvslog mailing list