[FFmpeg-cvslog] x86/tx_float: replace fft_sr_avx with fft_sr_fma3

Lynne git at videolan.org
Sat May 21 03:14:04 EEST 2022


ffmpeg | branch: master | Lynne <dev at lynne.ee> | Sat May 21 01:39:12 2022 +0200| [27cffd16aadd04bf2311e3114b8aa81e3732552f] | committer: Lynne

x86/tx_float: replace fft_sr_avx with fft_sr_fma3

When the SLOW_GATHER flag was added to the AVX2 version, FMA3
features were no longer enabled on Zen CPUs.
Since FMA3 gives a 6-7% improvement on all platforms that support
it, this commit removes the AVX version and replaces it with an
FMA3 version in the interest of saving space.
The only CPUs affected are Sandy Bridge and Bulldozer, which
have AVX support, but no FMA3 support.
In the future, if there's a demand for it, a version of the
function duplicated for AVX can be added.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=27cffd16aadd04bf2311e3114b8aa81e3732552f
---

 libavutil/x86/tx_float.asm    |  4 ++--
 libavutil/x86/tx_float_init.c | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 511d8d6fa3..21f99d3945 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -1285,8 +1285,8 @@ FFT_SPLIT_RADIX_DEF 131072
 %endmacro
 
 %if ARCH_X86_64
-FFT_SPLIT_RADIX_FN avx,  float,    0
-FFT_SPLIT_RADIX_FN avx,  ns_float, 1
+FFT_SPLIT_RADIX_FN fma3, float,    0
+FFT_SPLIT_RADIX_FN fma3, ns_float, 1
 %if HAVE_AVX2_EXTERNAL
 FFT_SPLIT_RADIX_FN avx2, float,    0
 FFT_SPLIT_RADIX_FN avx2, ns_float, 1
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index 108f9b4b04..5db0b57d13 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -38,8 +38,8 @@ TX_DECL_FN(fft32,     avx)
 TX_DECL_FN(fft32_ns,  avx)
 TX_DECL_FN(fft32,     fma3)
 TX_DECL_FN(fft32_ns,  fma3)
-TX_DECL_FN(fft_sr,    avx)
-TX_DECL_FN(fft_sr_ns, avx)
+TX_DECL_FN(fft_sr,    fma3)
+TX_DECL_FN(fft_sr_ns, fma3)
 TX_DECL_FN(fft_sr,    avx2)
 TX_DECL_FN(fft_sr_ns, avx2)
 
@@ -88,13 +88,13 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
     TX_DEF(fft32,    FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
            AV_CPU_FLAG_AVXSLOW),
-#if HAVE_AVX2_EXTERNAL
-    TX_DEF(fft_sr,    FFT, 64, 131072, 2, 0, 256, b8_i2, avx,  AVX,  0, AV_CPU_FLAG_AVXSLOW),
-    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx,  AVX,  AV_TX_INPLACE | FF_TX_PRESHUFFLE,
+    TX_DEF(fft_sr,    FFT, 64, 131072, 2, 0, 288, b8_i2, fma3,  FMA3,  0, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3,  FMA3,  AV_TX_INPLACE | FF_TX_PRESHUFFLE,
            AV_CPU_FLAG_AVXSLOW),
-    TX_DEF(fft_sr,    FFT, 64, 131072, 2, 0, 288, b8_i2, avx2, AVX2, 0,
+#if HAVE_AVX2_EXTERNAL
+    TX_DEF(fft_sr,    FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
            AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
-    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
+    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
            AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
 #endif
 #endif



More information about the ffmpeg-cvslog mailing list