[FFmpeg-cvslog] opus_pvq_search: split functions into exactness and only use the exact if its faster

Fri Aug 18 21:51:33 EEST 2017

I asked for a revert; instead you committed more on top of it,
ignoring everything I said on IRC and in mail.

On 8/18/17, Rostislav Pehlivanov <git at videolan.org> wrote:
> ffmpeg | branch: master | Rostislav Pehlivanov <atomnuker at gmail.com> | Fri
> Aug 18 19:29:33 2017 +0100| [3c99523a2864af729a8576c3fffe81fb884fa0d5] |
> committer: Rostislav Pehlivanov
>
> opus_pvq_search: split functions into exactness and only use the exact if
> its faster
>
> This splits the asm function into exact and non-exact version. The exact
> version is as fast or faster on newer CPUs (which EXTERNAL_AVX_FAST
> describes
> well) whilst the non-exact version is faster than the exact on older CPUs.
>
> Also fixes yasm compilation which doesn't accept !cpuflags(avx) syntax.
>
> Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
>
>> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3c99523a2864af729a8576c3fffe81fb884fa0d5
> ---
>
>  libavcodec/x86/opus_dsp_init.c     | 14 +++++++-------
>  libavcodec/x86/opus_pvq_search.asm | 34 +++++++++++++++++++++-------------
>  2 files changed, 28 insertions(+), 20 deletions(-)
>
> diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c
> index c51f786ee8..a9f8a96159 100644
> --- a/libavcodec/x86/opus_dsp_init.c
> +++ b/libavcodec/x86/opus_dsp_init.c
> @@ -24,9 +24,9 @@
>  #include "libavutil/x86/cpu.h"
>  #include "libavcodec/opus_pvq.h"
>
> -extern float ff_pvq_search_sse2(float *X, int *y, int K, int N);
> -extern float ff_pvq_search_sse4(float *X, int *y, int K, int N);
> -extern float ff_pvq_search_avx (float *X, int *y, int K, int N);
> +extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
> +extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
> +extern float ff_pvq_search_exact_avx  (float *X, int *y, int K, int N);
>
>  av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
>  {
> @@ -34,12 +34,12 @@ av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
>
>  #if CONFIG_OPUS_ENCODER
>      if (EXTERNAL_SSE2(cpu_flags))
> -        s->pvq_search = ff_pvq_search_sse2;
> +        s->pvq_search = ff_pvq_search_approx_sse2;
>
>      if (EXTERNAL_SSE4(cpu_flags))
> -        s->pvq_search = ff_pvq_search_sse4;
> +        s->pvq_search = ff_pvq_search_approx_sse4;
>
> -    if (EXTERNAL_AVX(cpu_flags))
> -        s->pvq_search = ff_pvq_search_avx;
> +    if (EXTERNAL_AVX_FAST(cpu_flags))
> +        s->pvq_search = ff_pvq_search_exact_avx;
>  #endif
>  }
> diff --git a/libavcodec/x86/opus_pvq_search.asm
> b/libavcodec/x86/opus_pvq_search.asm
> index 2f4864c95c..8cf040465d 100644
> --- a/libavcodec/x86/opus_pvq_search.asm
> +++ b/libavcodec/x86/opus_pvq_search.asm
> @@ -82,7 +82,7 @@ SECTION .text
>  %endif
>  %endmacro
>
> -%macro PULSES_SEARCH 1
> +%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
>  ; m6 Syy_norm
>  ; m7 Sxy_norm
>      addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
> @@ -96,7 +96,7 @@ align 16
>      movaps         m4, [tmpY + r4]  ; y[i]
>      movaps         m5, [tmpX + r4]  ; X[i]
>
> -%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but
> fast 1/sqrt
> +%if %2
>      xorps          m0, m0
>      cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
>  %endif
> @@ -104,7 +104,7 @@ align 16
>      addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
>      addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm
>
> -%if !cpuflag(avx)
> +%if %2
>      andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent
> aproximation error from setting pulses in array padding.
>  %endif
>
> @@ -119,7 +119,7 @@ align 16
>      andps          m5, m0               ; (0<y)?m5:0
>  %endif
>
> -%if !cpuflag(avx)
> +%if %2
>      rsqrtps        m4, m4
>      mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
>  %else
> @@ -211,8 +211,13 @@ align 16
>  ; uint32 K      - Number of pulses to have after quantizations.
>  ; uint32 N      - Number of vector elements. Must be 0 < N < 256
>  ;
> -%macro PVQ_FAST_SEARCH 0
> -cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
> +%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
> +%if %1
> +cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
> +%else
> +cglobal pvq_search_exact,  4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
> +%endif
> +
>  %define tmpX rsp
>  %define tmpY outYq
>
> @@ -255,7 +260,7 @@ align 16
>      jz   %%zero_input       ; if (Sx==0) goto zero_input
>
>      cvtsi2ss  xm0, dword Kd ; m0 = K
> -%if !cpuflag(avx)
> +%if %1
>      rcpss     xm1, xm1      ; m1 = approx(1/Sx)
>      mulss     xm0, xm1      ; m0 = K*(1/Sx)
>  %else
> @@ -308,7 +313,7 @@ align 16
>  align 16                        ; K - pulses > 0
>  %%add_pulses_loop:
>
> -    PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm
> +    PULSES_SEARCH add, %1   ; m6 Syy_norm ; m7 Sxy_norm
>
>      sub        Kd, 1
>      jnz  %%add_pulses_loop
> @@ -320,7 +325,7 @@ align 16                        ; K - pulses > 0
>  align 16
>  %%remove_pulses_loop:
>
> -    PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm
> +    PULSES_SEARCH sub, %1   ; m6 Syy_norm ; m7 Sxy_norm
>
>      add        Kd, 1
>      jnz  %%remove_pulses_loop
> @@ -367,12 +372,15 @@ align 16
>      jmp  %%return
>  %endmacro
>
> -
> +; if 1, use a float op that give half precision but execute for around 3
> cycles.
> +; On Skylake & Ryzen the division is much faster (around 11c/3),
> +; that makes the full precision code about 2% slower.
> +; Opus also does use rsqrt approximation in their intrinsics code.
>  INIT_XMM sse2
> -PVQ_FAST_SEARCH
> +PVQ_FAST_SEARCH 1
>
>  INIT_XMM sse4
> -PVQ_FAST_SEARCH
> +PVQ_FAST_SEARCH 1
>
>  INIT_XMM avx
> -PVQ_FAST_SEARCH
> +PVQ_FAST_SEARCH 0

I asked you to turn these into, e.g.:

  %define USE_APPROXIMATION 0
  INIT_XMM avx
  PVQ_FAST_SEARCH _exact

where:

  %macro PVQ_FAST_SEARCH 1
  cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N