[FFmpeg-devel] [PATCH 2/3] x86/float_dsp: unroll loop in vector_fmac_scalar
Christophe Gisquet
christophe.gisquet at gmail.com
Wed Apr 16 12:06:04 CEST 2014
Hi,
> ~6% faster SSE2 performance. AVX/FMA3 are unaffected.
What CPU, environment and test case have you used?
For SSE2, if I'm not mistaken, the difference in the code is that the
unrolled part uses different registers. When I tested that with AAC,
which often makes calls for 64 elements, it was not a win on mingw64.
But a 6% win for most typical systems is certainly better than a <1% loss
for a few. I'm OK with the change otherwise.
Best regards,
Christophe
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
> libavutil/x86/float_dsp.asm | 44 ++++++++++++++++++++++++++------------------
> 1 file changed, 26 insertions(+), 18 deletions(-)
>
> diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
> index 01ac60e..8d236ef 100644
> --- a/libavutil/x86/float_dsp.asm
> +++ b/libavutil/x86/float_dsp.asm
> @@ -61,9 +61,9 @@ VECTOR_FMUL
>
> %macro VECTOR_FMAC_SCALAR 0
> %if UNIX64
> -cglobal vector_fmac_scalar, 3,3,3, dst, src, len
> +cglobal vector_fmac_scalar, 3,3,5, dst, src, len
> %else
> -cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
> +cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
> %endif
> %if ARCH_X86_32
> VBROADCASTSS m0, mulm
> @@ -78,23 +78,31 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
> %endif
> lea lenq, [lend*4-64]
> .loop:
> -%assign a 0
> -%rep 32/mmsize
> %if cpuflag(fma3)
> - mova m1, [dstq+lenq+(a+0)*mmsize]
> - mova m2, [dstq+lenq+(a+1)*mmsize]
> - fmaddps m1, m0, [srcq+lenq+(a+0)*mmsize], m1
> - fmaddps m2, m0, [srcq+lenq+(a+1)*mmsize], m2
> -%else
> - mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
> - mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
> - addps m1, m1, [dstq+lenq+(a+0)*mmsize]
> - addps m2, m2, [dstq+lenq+(a+1)*mmsize]
> -%endif
> - mova [dstq+lenq+(a+0)*mmsize], m1
> - mova [dstq+lenq+(a+1)*mmsize], m2
> -%assign a a+2
> -%endrep
> + mova m1, [dstq+lenq]
> + mova m2, [dstq+lenq+1*mmsize]
> + fmaddps m1, m0, [srcq+lenq], m1
> + fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
> +%else ; cpuflag
> + mulps m1, m0, [srcq+lenq]
> + mulps m2, m0, [srcq+lenq+1*mmsize]
> +%if mmsize < 32
> + mulps m3, m0, [srcq+lenq+2*mmsize]
> + mulps m4, m0, [srcq+lenq+3*mmsize]
> +%endif ; mmsize
> + addps m1, m1, [dstq+lenq]
> + addps m2, m2, [dstq+lenq+1*mmsize]
> +%if mmsize < 32
> + addps m3, m3, [dstq+lenq+2*mmsize]
> + addps m4, m4, [dstq+lenq+3*mmsize]
> +%endif ; mmsize
> +%endif ; cpuflag
> + mova [dstq+lenq], m1
> + mova [dstq+lenq+1*mmsize], m2
> +%if mmsize < 32
> + mova [dstq+lenq+2*mmsize], m3
> + mova [dstq+lenq+3*mmsize], m4
> +%endif ; mmsize
> sub lenq, 64
> jge .loop
> REP_RET
> --
> 1.8.3.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
More information about the ffmpeg-devel mailing list