[FFmpeg-devel] [PATCH] avcodec/takdec: add x86 SIMD for rest of decorrelation modes
James Almer
jamrial at gmail.com
Tue Oct 6 00:00:08 CEST 2015
On 10/5/2015 6:34 PM, Paul B Mahol wrote:
> diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
> new file mode 100644
> index 0000000..0158d4d
> --- /dev/null
> +++ b/libavcodec/x86/takdsp.asm
> @@ -0,0 +1,94 @@
> +;******************************************************************************
> +;* TAK DSP SIMD optimizations
> +;*
> +;* Copyright (C) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pd_128: dd 128 ; one dword holding 128; tak_decorrelate_sf broadcasts it and adds it before its shift right by 8
> +
> +SECTION .text
> +
> +INIT_XMM sse2
> +cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length ; p2[i] = p1[i] + p2[i] on packed dwords; p1 is only read
> + .loop:
> + mova m0, [p1q] ; m0 = one vector of dwords from p1
> + mova m1, [p2q] ; m1 = matching vector from p2
> + paddd m0, m1 ; per-dword sum p1 + p2
paddd m0, [p2q]
> + mova [p2q], m0 ; the sum overwrites p2
> + add p1q, mmsize ; step both pointers by one vector
> + add p2q, mmsize
> + sub lengthd, mmsize/4 ; length counts dwords, so one vector consumes mmsize/4 of them
Do the neg trick Hendrik told you about for the maskedmerge filter. That
way you will only need to do an add on the length register per loop.
Also, if the buffer is properly padded you could do 32 bytes at a time
instead of 16.
Same applies to the other functions.
> + jg .loop ; test is at the bottom: the body always runs at least once, and always on a whole vector
> + REP_RET
> +
> +;------------------------------------------------------------------------------
> +; tak_decorrelate_sr(p1, p2, length)
> +;   p1[i] = p2[i] - p1[i] for every packed dword; p2 is only read.
> +;   length counts dwords. The loop test is at the bottom and each pass handles
> +;   one whole vector, so the caller must pass length > 0 and buffers that are
> +;   readable/writable up to the next multiple of mmsize bytes.
> +;   mova is the aligned move, so p1 and p2 must be mmsize-aligned.
> +;------------------------------------------------------------------------------
> +cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
> + .loop:
> + mova m0, [p1q] ; m0 = p1[i..]
> + mova m1, [p2q] ; m1 = p2[i..]
> + psubd m1, m0 ; m1 = p2 - p1
> + mova [p1q], m1 ; store the difference (storing m0 here would write p1 back unchanged)
> + add p1q, mmsize
> + add p2q, mmsize
> + sub lengthd, mmsize/4 ; one vector = mmsize/4 dwords consumed
> + jg .loop
> + REP_RET
> +
> +;------------------------------------------------------------------------------
> +; tak_decorrelate_sm(p1, p2, length)
> +;   For every packed dword:
> +;       a     = p1[i] - (p2[i] >> 1)
> +;       p1[i] = a
> +;       p2[i] = a + p2[i]
> +;   The samples are treated as signed 32-bit values, so the halving must be an
> +;   arithmetic shift (psrad); a logical shift (psrld) would turn negative
> +;   inputs into large positive ones.
> +;   length counts dwords. The loop test is at the bottom and each pass handles
> +;   one whole vector, so the caller must pass length > 0 and buffers padded to
> +;   a multiple of mmsize bytes. mova requires mmsize-aligned p1 and p2.
> +;------------------------------------------------------------------------------
> +cglobal tak_decorrelate_sm, 3, 3, 3, p1, p2, length
> + .loop:
> + mova m0, [p1q] ; m0 = p1[i..]
> + mova m1, [p2q] ; m1 = p2[i..]
> + mova m2, m1
> + psrad m2, 1 ; m2 = p2 >> 1, sign-preserving
> + psubd m0, m2 ; m0 = a = p1 - (p2 >> 1)
> + paddd m1, m0 ; m1 = a + p2
> + mova [p1q], m0
> + mova [p2q], m1
> + add p1q, mmsize
> + add p2q, mmsize
> + sub lengthd, mmsize/4 ; one vector = mmsize/4 dwords consumed
> + jg .loop
> + REP_RET
> +
> +INIT_XMM sse4
> +;------------------------------------------------------------------------------
> +; tak_decorrelate_sf(p1, p2, length, dshift, dfactor)
> +;   For every packed dword:
> +;       b     = ((dfactor * (p2[i] >> dshift) + 128) >> 8) << dshift
> +;       p1[i] = b - p1[i]
> +;   p2 is only read. The samples are treated as signed 32-bit values, so both
> +;   right shifts are arithmetic (psrad); psrld would corrupt negative values.
> +;   Register roles: m2 = dshift (shift count), m3 = dfactor in every lane,
> +;   m4 = 128 in every lane.
> +;   length counts dwords. The loop test is at the bottom and each pass handles
> +;   one whole vector, so the caller must pass length > 0 and buffers padded to
> +;   a multiple of mmsize bytes. mova requires mmsize-aligned p1 and p2.
> +;------------------------------------------------------------------------------
> +cglobal tak_decorrelate_sf, 5, 5, 5, p1, p2, length, dshift, dfactor
> + movd m2, dshiftm ; shift count lives in the low dword; movd clears the rest
> + movd m3, dfactorm
Change the cglobal line to 3, 3, 5. On x86_32 it will prevent the
unnecessary load of the last two arguments on gprs.
> + pshufd m3, m3, 0 ; broadcast dfactor to all four lanes
> + movd m4, [pd_128]
Change the pd_128 constant in Rodata to "times 4 dd 128" then just
do a mova m4, [pd_128]. It will save you the pshufd below.
> + pshufd m4, m4, 0 ; broadcast 128 to all four lanes
> +
> + .loop:
> + mova m0, [p1q] ; m0 = p1[i..]
> + mova m1, [p2q] ; m1 = p2[i..]
> + psrad m1, m2 ; p2 >> dshift, sign-preserving
> + pmulld m1, m3 ; * dfactor (low 32 bits of each product)
> + paddd m1, m4 ; + 128
> + psrad m1, 8 ; >> 8, sign-preserving
> + pslld m1, m2 ; << dshift
> + psubd m1, m0 ; b - p1
> + mova [p1q], m1
> + add p1q, mmsize
> + add p2q, mmsize
> + sub lengthd, mmsize/4 ; one vector = mmsize/4 dwords consumed
> + jg .loop
> + REP_RET
More information about the ffmpeg-devel
mailing list