[FFmpeg-devel] [PATCH] avfilter/vf_w3fdif: add x86 SIMD
James Almer
jamrial at gmail.com
Thu Oct 8 19:48:07 CEST 2015
On 10/8/2015 2:02 PM, Paul B Mahol wrote:
> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
> new file mode 100644
> index 0000000..96b61d7
> --- /dev/null
> +++ b/libavfilter/x86/vf_w3fdif.asm
> @@ -0,0 +1,284 @@
> +;*****************************************************************************
> +;* x86-optimized functions for w3fdif filter
> +;*
> +;* Copyright (c) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pd_0: times 4 dd 0
Just use pxor to zero a register.
> +pd_2_23: times 4 dd 256*256*128
> +
> +SECTION .text
> +
> +INIT_XMM sse4
> +cglobal w3fdif_scale, 3, 3, 3, 0, out_pixel, work_pixel, linesize
> + mova m1, [pd_0]
> + mova m2, [pd_2_23]
> + shr linesized, 2
> +
> + .loop
> + mova m0, [work_pixelq]
> + pmaxsd m0, m1
> + pminsd m0, m2
You can emulate these two using SSE2 instructions. See CLIPD_SSE2 (using
float conversion) and CLIPD_MMX in x86util.asm.
> + psrld m0, 15
> + packusdw m0, m0
> + packuswb m0, m0
> + movd [out_pixelq], m0
> + add out_pixelq, mmsize/4
> + add work_pixelq, mmsize
> + sub linesized, 1
> + jg .loop
> +REP_RET
> +
> +INIT_XMM sse2
> +cglobal w3fdif_simple_low, 4, 6, 5, 0, work_line, in_lines_cur0, coef, linesize
> + movd m0, [coefq+0]
> + movd m1, [coefq+2]
movd m1, [coefq]
SPLATW m0, m1, 0
SPLATW m1, m1, 1
> + SPLATW m0, m0
> + SPLATW m1, m1
> + shr linesized, 3
> + mov r4q, 0
> + mov r5q, [in_lines_cur0q + gprsize]
> + mov in_lines_cur0q, [in_lines_cur0q]
> + %define in_lines_cur1q r5q
> +
> + .loop
> + movh m2, [in_lines_cur0q+r4q]
> + movh m3, [in_lines_cur1q+r4q]
> + pxor m4, m4
> + punpcklbw m2, m4
> + punpcklbw m3, m4
> + SBUTTERFLY wd, 2, 3, 4
> + pmaddwd m2, m0
> + pmaddwd m3, m1
> + mova [work_lineq+r4q*4], m2
> + mova [work_lineq+r4q*4+mmsize], m3
> + add r4q, 8
> + sub linesized, 1
> + jg .loop
> +REP_RET
> +
> +cglobal w3fdif_simple_high, 5, 10, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
This is clearly not x86_32 friendly, so you will either have to get it working
using 7 regs, or mark it as x86_64 only.
> + movd m0, [coefq+0]
> + movd m1, [coefq+2]
> + movd m2, [coefq+4]
movq m2, [coefq]
SPLATW m0, m2, 0
SPLATW m1, m2, 1
SPLATW m2, m2, 2
And so on for every function.
> + SPLATW m0, m0
> + SPLATW m1, m1
> + SPLATW m2, m2
> + SBUTTERFLY wd, 0, 1, 7
> + shr linesized, 3
This shift seems pointless if the only other instruction using linesize is
the sub at the end of the loop.
Can't you do the neg trick on linesize and use that as part of the effective
addresses inside the loop, instead of a zeroed r5q?
> + mov r5q, 0
> + mov r7q, [in_lines_cur0q+gprsize*2]
> + mov r6q, [in_lines_cur0q+gprsize]
> + mov in_lines_cur0q, [in_lines_cur0q]
> + %define in_lines_cur1q r6q
> + %define in_lines_cur2q r7q
Instead of defining their names here, just name them in the cglobal line.
You can name registers there that aren't function arguments just fine.
Both the above suggestions apply to other functions as well.
More information about the ffmpeg-devel mailing list