[FFmpeg-cvslog] opus_pvq_search: split functions by exactness and only use the exact one if it's faster

Fri Aug 18 21:33:35 EEST 2017

ffmpeg | branch: master | Rostislav Pehlivanov <atomnuker at gmail.com> | Fri Aug 18 19:29:33 2017 +0100| [3c99523a2864af729a8576c3fffe81fb884fa0d5] | committer: Rostislav Pehlivanov

opus_pvq_search: split functions by exactness and only use the exact one if it's faster

This splits the asm function into exact and non-exact versions. The exact
version is as fast or faster on newer CPUs (which EXTERNAL_AVX_FAST describes
well) whilst the non-exact version is faster than the exact on older CPUs.

Also fixes yasm compilation, which doesn't accept the !cpuflag(avx) syntax.

Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3c99523a2864af729a8576c3fffe81fb884fa0d5
---

 libavcodec/x86/opus_dsp_init.c     | 14 +++++++-------
 libavcodec/x86/opus_pvq_search.asm | 34 +++++++++++++++++++++-------------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c
index c51f786ee8..a9f8a96159 100644
--- a/libavcodec/x86/opus_dsp_init.c
+++ b/libavcodec/x86/opus_dsp_init.c
@@ -24,9 +24,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/opus_pvq.h"
 
-extern float ff_pvq_search_sse2(float *X, int *y, int K, int N);
-extern float ff_pvq_search_sse4(float *X, int *y, int K, int N);
-extern float ff_pvq_search_avx (float *X, int *y, int K, int N);
+extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
+extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
+extern float ff_pvq_search_exact_avx  (float *X, int *y, int K, int N);
 
 av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
 {
@@ -34,12 +34,12 @@ av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
 
 #if CONFIG_OPUS_ENCODER
     if (EXTERNAL_SSE2(cpu_flags))
-        s->pvq_search = ff_pvq_search_sse2;
+        s->pvq_search = ff_pvq_search_approx_sse2;
 
     if (EXTERNAL_SSE4(cpu_flags))
-        s->pvq_search = ff_pvq_search_sse4;
+        s->pvq_search = ff_pvq_search_approx_sse4;
 
-    if (EXTERNAL_AVX(cpu_flags))
-        s->pvq_search = ff_pvq_search_avx;
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        s->pvq_search = ff_pvq_search_exact_avx;
 #endif
 }
diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm
index 2f4864c95c..8cf040465d 100644
--- a/libavcodec/x86/opus_pvq_search.asm
+++ b/libavcodec/x86/opus_pvq_search.asm
@@ -82,7 +82,7 @@ SECTION .text
 %endif
 %endmacro
 
-%macro PULSES_SEARCH 1
+%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
 ; m6 Syy_norm
 ; m7 Sxy_norm
     addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
@@ -96,7 +96,7 @@ align 16
     movaps         m4, [tmpY + r4]  ; y[i]
     movaps         m5, [tmpX + r4]  ; X[i]
 
-%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but fast 1/sqrt
+%if %2
     xorps          m0, m0
     cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
 %endif
@@ -104,7 +104,7 @@ align 16
     addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
     addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm
 
-%if !cpuflag(avx)
+%if %2
     andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
 %endif
 
@@ -119,7 +119,7 @@ align 16
     andps          m5, m0               ; (0<y)?m5:0
 %endif
 
-%if !cpuflag(avx)
+%if %2
     rsqrtps        m4, m4
     mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
 %else
@@ -211,8 +211,13 @@ align 16
 ; uint32 K      - Number of pulses to have after quantizations.
 ; uint32 N      - Number of vector elements. Must be 0 < N < 256
 ;
-%macro PVQ_FAST_SEARCH 0
-cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
+%if %1
+cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%else
+cglobal pvq_search_exact,  4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%endif
+
 %define tmpX rsp
 %define tmpY outYq
 
@@ -255,7 +260,7 @@ align 16
     jz   %%zero_input       ; if (Sx==0) goto zero_input
 
     cvtsi2ss  xm0, dword Kd ; m0 = K
-%if !cpuflag(avx)
+%if %1
     rcpss     xm1, xm1      ; m1 = approx(1/Sx)
     mulss     xm0, xm1      ; m0 = K*(1/Sx)
 %else
@@ -308,7 +313,7 @@ align 16
 align 16                        ; K - pulses > 0
 %%add_pulses_loop:
 
-    PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm
+    PULSES_SEARCH add, %1   ; m6 Syy_norm ; m7 Sxy_norm
 
     sub        Kd, 1
     jnz  %%add_pulses_loop
@@ -320,7 +325,7 @@ align 16                        ; K - pulses > 0
 align 16
 %%remove_pulses_loop:
 
-    PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm
+    PULSES_SEARCH sub, %1   ; m6 Syy_norm ; m7 Sxy_norm
 
     add        Kd, 1
     jnz  %%remove_pulses_loop
@@ -367,12 +372,15 @@ align 16
     jmp  %%return
 %endmacro
 
-
+; If 1, use a float op that gives half precision but executes in around 3 cycles.
+; On Skylake & Ryzen the division is much faster (around 11c/3),
+; which makes the full precision code about 2% slower.
+; Opus also uses the rsqrt approximation in its intrinsics code.
 INIT_XMM sse2
-PVQ_FAST_SEARCH
+PVQ_FAST_SEARCH 1
 
 INIT_XMM sse4
-PVQ_FAST_SEARCH
+PVQ_FAST_SEARCH 1
 
 INIT_XMM avx
-PVQ_FAST_SEARCH
+PVQ_FAST_SEARCH 0