[FFmpeg-cvslog] opus_pvq_search: split functions into exactness and only use the exact if its faster
Ivan Kalvachev
ikalvachev at gmail.com
Fri Aug 18 21:51:33 EEST 2017
I asked for a revert; instead, you committed more on top of it,
ignoring everything I said on IRC and in mail.
On 8/18/17, Rostislav Pehlivanov <git at videolan.org> wrote:
> ffmpeg | branch: master | Rostislav Pehlivanov <atomnuker at gmail.com> | Fri
> Aug 18 19:29:33 2017 +0100| [3c99523a2864af729a8576c3fffe81fb884fa0d5] |
> committer: Rostislav Pehlivanov
>
> opus_pvq_search: split functions into exactness and only use the exact if
> its faster
>
> This splits the asm function into exact and non-exact version. The exact
> version is as fast or faster on newer CPUs (which EXTERNAL_AVX_FAST describes
> well) whilst the non-exact version is faster than the exact on older CPUs.
>
> Also fixes yasm compilation which doesn't accept !cpuflags(avx) syntax.
>
> Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
>
>> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3c99523a2864af729a8576c3fffe81fb884fa0d5
> ---
>
> libavcodec/x86/opus_dsp_init.c | 14 +++++++-------
> libavcodec/x86/opus_pvq_search.asm | 34 +++++++++++++++++++++-------------
> 2 files changed, 28 insertions(+), 20 deletions(-)
>
> diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c
> index c51f786ee8..a9f8a96159 100644
> --- a/libavcodec/x86/opus_dsp_init.c
> +++ b/libavcodec/x86/opus_dsp_init.c
> @@ -24,9 +24,9 @@
> #include "libavutil/x86/cpu.h"
> #include "libavcodec/opus_pvq.h"
>
> -extern float ff_pvq_search_sse2(float *X, int *y, int K, int N);
> -extern float ff_pvq_search_sse4(float *X, int *y, int K, int N);
> -extern float ff_pvq_search_avx (float *X, int *y, int K, int N);
> +extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
> +extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
> +extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N);
>
> av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
> {
> @@ -34,12 +34,12 @@ av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
>
> #if CONFIG_OPUS_ENCODER
> if (EXTERNAL_SSE2(cpu_flags))
> - s->pvq_search = ff_pvq_search_sse2;
> + s->pvq_search = ff_pvq_search_approx_sse2;
>
> if (EXTERNAL_SSE4(cpu_flags))
> - s->pvq_search = ff_pvq_search_sse4;
> + s->pvq_search = ff_pvq_search_approx_sse4;
>
> - if (EXTERNAL_AVX(cpu_flags))
> - s->pvq_search = ff_pvq_search_avx;
> + if (EXTERNAL_AVX_FAST(cpu_flags))
> + s->pvq_search = ff_pvq_search_exact_avx;
> #endif
> }
> diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm
> index 2f4864c95c..8cf040465d 100644
> --- a/libavcodec/x86/opus_pvq_search.asm
> +++ b/libavcodec/x86/opus_pvq_search.asm
> @@ -82,7 +82,7 @@ SECTION .text
> %endif
> %endmacro
>
> -%macro PULSES_SEARCH 1
> +%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
> ; m6 Syy_norm
> ; m7 Sxy_norm
> addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
> @@ -96,7 +96,7 @@ align 16
> movaps m4, [tmpY + r4] ; y[i]
> movaps m5, [tmpX + r4] ; X[i]
>
> -%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but fast 1/sqrt
> +%if %2
> xorps m0, m0
> cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
> %endif
> @@ -104,7 +104,7 @@ align 16
> addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
> addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
>
> -%if !cpuflag(avx)
> +%if %2
> andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
> %endif
>
> @@ -119,7 +119,7 @@ align 16
> andps m5, m0 ; (0<y)?m5:0
> %endif
>
> -%if !cpuflag(avx)
> +%if %2
> rsqrtps m4, m4
> mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
> %else
> @@ -211,8 +211,13 @@ align 16
> ; uint32 K - Number of pulses to have after quantizations.
> ; uint32 N - Number of vector elements. Must be 0 < N < 256
> ;
> -%macro PVQ_FAST_SEARCH 0
> -cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
> +%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
> +%if %1
> +cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
> +%else
> +cglobal pvq_search_exact, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
> +%endif
> +
> %define tmpX rsp
> %define tmpY outYq
>
> @@ -255,7 +260,7 @@ align 16
> jz %%zero_input ; if (Sx==0) goto zero_input
>
> cvtsi2ss xm0, dword Kd ; m0 = K
> -%if !cpuflag(avx)
> +%if %1
> rcpss xm1, xm1 ; m1 = approx(1/Sx)
> mulss xm0, xm1 ; m0 = K*(1/Sx)
> %else
> @@ -308,7 +313,7 @@ align 16
> align 16 ; K - pulses > 0
> %%add_pulses_loop:
>
> - PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
> + PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm
>
> sub Kd, 1
> jnz %%add_pulses_loop
> @@ -320,7 +325,7 @@ align 16 ; K - pulses > 0
> align 16
> %%remove_pulses_loop:
>
> - PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
> + PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm
>
> add Kd, 1
> jnz %%remove_pulses_loop
> @@ -367,12 +372,15 @@ align 16
> jmp %%return
> %endmacro
>
> -
> +; if 1, use a float op that give half precision but execute for around 3 cycles.
> +; On Skylake & Ryzen the division is much faster (around 11c/3),
> +; that makes the full precision code about 2% slower.
> +; Opus also does use rsqrt approximation in their intrinsics code.
> INIT_XMM sse2
> -PVQ_FAST_SEARCH
> +PVQ_FAST_SEARCH 1
>
> INIT_XMM sse4
> -PVQ_FAST_SEARCH
> +PVQ_FAST_SEARCH 1
>
> INIT_XMM avx
> -PVQ_FAST_SEARCH
> +PVQ_FAST_SEARCH 0
I asked you to turn these into, e.g.:
%define USE_APPROXIMATION 0
INIT_XMM avx
PVQ_FAST_SEARCH _exact
where:
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
More information about the ffmpeg-cvslog
mailing list