[FFmpeg-devel] [PATCH] avutil/x86/float_dsp: add fma3 for scalarproduct
James Almer
jamrial at gmail.com
Thu Jan 21 05:33:32 EET 2021
On 1/20/2021 5:30 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
> libavutil/x86/float_dsp.asm | 112 +++++++++++++++++++++++++++++++++
> libavutil/x86/float_dsp_init.c | 2 +
> 2 files changed, 114 insertions(+)
>
> diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
> index 517fd63638..f7497df34e 100644
> --- a/libavutil/x86/float_dsp.asm
> +++ b/libavutil/x86/float_dsp.asm
> @@ -463,6 +463,118 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
> %endif
> RET
>
> +INIT_YMM fma3
> +cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
> + xor offsetq, offsetq
> + xorps m0, m0
> + shl sized, 2
> + mov lenq, sizeq
> + cmp lenq, 32
> + jl .l16
> + cmp lenq, 64
> + jl .l32
> + cmp lenq, 128
> + jl .l64
> + and lenq, ~127
> + xorps m1, m1
> + xorps m2, m2
> + xorps m3, m3
> +.loop128:
> + movups m4, [v1q+offsetq]
> + movups m5, [v1q+offsetq + 32]
> + movups m6, [v1q+offsetq + 64]
> + movups m7, [v1q+offsetq + 96]
> + fmaddps m0, m4, [v2q+offsetq ], m0
> + fmaddps m1, m5, [v2q+offsetq + 32], m1
> + fmaddps m2, m6, [v2q+offsetq + 64], m2
> + fmaddps m3, m7, [v2q+offsetq + 96], m3
You could use mmsize for the offsets (e.g. [v1q+offsetq + 1*mmsize]
instead of a hard-coded 32). It will make it easier to adapt this
function to eventually use avx512.
> + add offsetq, 128
> + cmp offsetq, lenq
> + jl .loop128
> + addps m0, m1
> + addps m2, m3
> + addps m0, m2
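Do only
addps m0, m2
addps m1, m3
here. There's no need to combine all four regs into m0 if you end up
jumping to l64, since m1 is still used there as an accumulator. Note that
l64 currently starts with "xorps m1, m1", which would discard that partial
sum, so m1 would have to be zeroed at the top of the function instead.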
> + mov lenq, sizeq
> + and lenq, 127
> + cmp lenq, 64
> + jge .l64
Then add
addps m0, m1
after this line, since l32 and l16 use only m0.
> + cmp lenq, 32
> + jge .l32
> + cmp lenq, 16
> + jge .l16
Move the next two instructions before this line. If you jump to l16 like
this, vextractf128 will not be executed and you'll be missing the upper
128 bits of m0 in the final sum.
> + vextractf128 xmm2, m0, 1
> + addps xmm0, xmm2
> + movhlps xmm1, xmm0
> + addps xmm0, xmm1
> + movss xmm1, xmm0
> + shufps xmm0, xmm0, 1
> + addss xmm0, xmm1
> + RET
> +.l64:
> + and lenq, ~63
> + add lenq, offsetq
> + xorps m1, m1
> +.loop64:
> + movups m4, [v1q+offsetq]
> + movups m5, [v1q+offsetq + 32]
> + fmaddps m0, m4, [v2q+offsetq], m0
> + fmaddps m1, m5, [v2q+offsetq + 32], m1
> + add offsetq, 64
> + cmp offsetq, lenq
> + jl .loop64
> + addps m0, m1
> + mov lenq, sizeq
> + and lenq, 63
> + cmp lenq, 32
> + jge .l32
> + cmp lenq, 16
> + jge .l16
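Ditto: the vextractf128/addps pair below needs to run before the jump to
l16 here as well, or the upper 128 bits of m0 are lost.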
> + vextractf128 xmm2, m0, 1
> + addps xmm0, xmm2
> + movhlps xmm1, xmm0
> + addps xmm0, xmm1
> + movss xmm1, xmm0
> + shufps xmm0, xmm0, 1
> + addss xmm0, xmm1
> + RET
> +.l32:
> + and lenq, ~31
> + add lenq, offsetq
> +.loop32:
> + movups m4, [v1q+offsetq]
> + fmaddps m0, m4, [v2q+offsetq], m0
> + add offsetq, 32
> + cmp offsetq, lenq
> + jl .loop32
> + vextractf128 xmm2, m0, 1
> + addps xmm0, xmm2
> + mov lenq, sizeq
> + and lenq, 31
> + cmp lenq, 16
> + jge .l16
You got it right here.
> + movhlps xmm1, xmm0
> + addps xmm0, xmm1
> + movss xmm1, xmm0
> + shufps xmm0, xmm0, 1
> + addss xmm0, xmm1
> + RET
> +.l16:
> + and lenq, ~15
> + add lenq, offsetq
> +.loop16:
> + movaps xmm1, [v1q+offsetq]
> + mulps xmm1, [v2q+offsetq]
> + addps xmm0, xmm1
> + add offsetq, 16
> + cmp offsetq, lenq
> + jl .loop16
> + movhlps xmm1, xmm0
> + addps xmm0, xmm1
> + movss xmm1, xmm0
> + shufps xmm0, xmm0, 1
> + addss xmm0, xmm1
> + RET
%if ARCH_X86_64 == 0
movss r0m, xm0
fld dword r0m
%endif
Add the block above before every RET in the function, so that 32-bit
builds load the result onto the x87 stack.
> +
> ;-----------------------------------------------------------------------------
> ; void ff_butterflies_float(float *src0, float *src1, int len);
> ;-----------------------------------------------------------------------------
> diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
> index 8826e4e2c9..67bfbe18d0 100644
> --- a/libavutil/x86/float_dsp_init.c
> +++ b/libavutil/x86/float_dsp_init.c
> @@ -76,6 +76,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
> const float *src1, int len);
>
> float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
> +float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
>
> void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
>
> @@ -117,5 +118,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
> fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
> fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
> fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
> + fdsp->scalarproduct_float = ff_scalarproduct_float_fma3;
> }
> }
>
More information about the ffmpeg-devel
mailing list