[FFmpeg-devel] [PATCH v2 2/3] avfilter/x86/vf_exposure: add ff_exposure_avx2
James Almer
jamrial at gmail.com
Sat Nov 20 18:53:42 EET 2021
On 11/4/2021 1:18 AM, Wu Jianhua wrote:
> Performance data(Less is better):
> exposure_sse: 500491
You reported a better result in the first patch.
> exposure_avx2: 449122
This looks like a really low speedup for a function that processes
twice the number of floats per loop iteration.
>
> Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
> ---
> libavfilter/x86/vf_exposure.asm | 15 +++++++++++++++
> libavfilter/x86/vf_exposure_init.c | 6 ++++++
> 2 files changed, 21 insertions(+)
>
> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
> index 3351c6fb3b..f271167805 100644
> --- a/libavfilter/x86/vf_exposure.asm
> +++ b/libavfilter/x86/vf_exposure.asm
> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
> VBROADCASTSS m1, xmm1
> %endif
>
> +%if cpuflag(fma3) || cpuflag(fma4)
Remove the fma4 check if you're not using it.
> + mulps m0, m0, m1 ; black * scale
> +%endif
> +
> .loop:
> +%if cpuflag(fma3) || cpuflag(fma4)
> + mova m2, m0
> + vfmsub231ps m2, m1, [ptrq]
> + movu [ptrq], m2
Have you tried not using FMA for this, i.e. keeping the sub + mul even
for AVX2, to see how it performs?
> +%else
> movu m2, [ptrq]
> subps m2, m2, m0
> mulps m2, m2, m1
> movu [ptrq], m2
> +%endif
> add ptrq, mmsize
> sub lengthq, mmsize/4
>
> @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
> %if ARCH_X86_64
> INIT_XMM sse
> EXPOSURE
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +EXPOSURE
> +%endif
> %endif
> diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
> index de1b360f6c..80dae6164e 100644
> --- a/libavfilter/x86/vf_exposure_init.c
> +++ b/libavfilter/x86/vf_exposure_init.c
> @@ -24,6 +24,7 @@
> #include "libavfilter/exposure.h"
>
> void ff_exposure_sse(float *ptr, int length, float black, float scale);
> +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>
> av_cold void ff_exposure_init_x86(ExposureContext *s)
> {
> @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
> #if ARCH_X86_64
> if (EXTERNAL_SSE(cpu_flags))
> s->exposure_func = ff_exposure_sse;
> +
> +#if HAVE_AVX2_EXTERNAL
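No need for this preprocessor check. EXTERNAL_AVX2_FAST() already evaluates to 0 when HAVE_AVX2_EXTERNAL is 0, so the branch is eliminated as dead code.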
> + if (EXTERNAL_AVX2_FAST(cpu_flags))
> + s->exposure_func = ff_exposure_avx2;
> +#endif
> #endif
> }
>
More information about the ffmpeg-devel
mailing list