[FFmpeg-devel] [PATCH v2 2/3] avfilter/x86/vf_exposure: add ff_exposure_avx2

James Almer jamrial at gmail.com
Sat Nov 20 18:53:42 EET 2021


On 11/4/2021 1:18 AM, Wu Jianhua wrote:
> Performance data(Less is better):
>      exposure_sse:   500491

You reported a better result in the first patch.

>      exposure_avx2:  449122

This looks like a really low speedup for a function that processes
twice as many floats per loop iteration.

> 
> Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
> ---
>   libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>   libavfilter/x86/vf_exposure_init.c |  6 ++++++
>   2 files changed, 21 insertions(+)
> 
> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
> index 3351c6fb3b..f271167805 100644
> --- a/libavfilter/x86/vf_exposure.asm
> +++ b/libavfilter/x86/vf_exposure.asm
> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>       VBROADCASTSS m1, xmm1
>   %endif
>   
> +%if cpuflag(fma3) || cpuflag(fma4)

Remove the fma4 check if you're not using it.

> +    mulps       m0, m0, m1 ; black * scale
> +%endif
> +
>   .loop:
> +%if cpuflag(fma3) || cpuflag(fma4)
> +    mova        m2, m0
> +    vfmsub231ps m2, m1, [ptrq]
> +    movu    [ptrq], m2

Have you tried not using FMA for this, keeping just the sub + mul even
for AVX2, to see how it performs?

> +%else
>       movu        m2, [ptrq]
>       subps       m2, m2, m0
>       mulps       m2, m2, m1
>       movu    [ptrq], m2
> +%endif
>       add       ptrq, mmsize
>       sub    lengthq, mmsize/4
>   
> @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>   %if ARCH_X86_64
>   INIT_XMM sse
>   EXPOSURE
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +EXPOSURE
> +%endif
>   %endif
> diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
> index de1b360f6c..80dae6164e 100644
> --- a/libavfilter/x86/vf_exposure_init.c
> +++ b/libavfilter/x86/vf_exposure_init.c
> @@ -24,6 +24,7 @@
>   #include "libavfilter/exposure.h"
>   
>   void ff_exposure_sse(float *ptr, int length, float black, float scale);
> +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>   
>   av_cold void ff_exposure_init_x86(ExposureContext *s)
>   {
> @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
>   #if ARCH_X86_64
>       if (EXTERNAL_SSE(cpu_flags))
>           s->exposure_func = ff_exposure_sse;
> +
> +#if HAVE_AVX2_EXTERNAL

No need for this preprocessor check; EXTERNAL_AVX2_FAST() already includes the HAVE_AVX2_EXTERNAL check.

> +    if (EXTERNAL_AVX2_FAST(cpu_flags))
> +        s->exposure_func = ff_exposure_avx2;
> +#endif
>   #endif
>   }
> 



More information about the ffmpeg-devel mailing list