[FFmpeg-devel] [PATCH] avfilter/vf_w3fdif: add x86 SIMD

Thu Oct 8 19:48:07 CEST 2015

On 10/8/2015 2:02 PM, Paul B Mahol wrote:
> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
> new file mode 100644
> index 0000000..96b61d7
> --- /dev/null
> +++ b/libavfilter/x86/vf_w3fdif.asm
> @@ -0,0 +1,284 @@
> +;*****************************************************************************
> +;* x86-optimized functions for w3fdif filter
> +;*
> +;* Copyright (c) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pd_0: times 4 dd 0

Just use pxor to zero a register.

> +pd_2_23: times 4 dd 256*256*128
> +
> +SECTION .text
> +
> +INIT_XMM sse4                                       ; use XMM registers; code below is built for the sse4 instruction set
> +cglobal w3fdif_scale, 3, 3, 3, 0, out_pixel, work_pixel, linesize ; 3 args, 3 GPRs, 3 vector regs (m0-m2), stack size 0
> +    mova                  m1, [pd_0]                ; m1 = lower clamp bound: 0 in every dword
> +    mova                  m2, [pd_2_23]             ; m2 = upper clamp bound: 256*256*128 in every dword
> +    shr            linesized, 2                      ; iteration count = linesize / 4
> +
> +    .loop                                            ; each iteration consumes mmsize bytes of work_pixel
> +    mova                         m0, [work_pixelq]   ; load one register of 32-bit work values
> +    pmaxsd                       m0, m1              ; signed max with 0: negative values become 0
> +    pminsd                       m0, m2              ; signed min with the upper bound

You can emulate these two using sse2 instructions. See CLIPD_SSE2 (using
float conversion) and CLIPD_MMX in x86util.asm

> +    psrld                        m0, 15              ; shift right by 15: clamped range 0..256*256*128 becomes 0..256
> +    packusdw                     m0, m0              ; dwords -> words, unsigned saturation
> +    packuswb                     m0, m0              ; words -> bytes, unsigned saturation (256 saturates to 255)
> +    movd               [out_pixelq], m0              ; store the low 32 bits: one byte per input dword
> +    add                  out_pixelq, mmsize/4        ; advance output by one byte per dword processed
> +    add                 work_pixelq, mmsize          ; advance input by one full register
> +    sub                   linesized, 1               ; one iteration done
> +    jg .loop                                         ; repeat while the count is still > 0 (body always runs at least once)
> +REP_RET                                             ; return via the x86inc return macro
> +
> +INIT_XMM sse2                                       ; use XMM registers; code below is built for the sse2 instruction set
> +cglobal w3fdif_simple_low, 4, 6, 5, 0, work_line, in_lines_cur0, coef, linesize ; 4 args, 6 GPRs (r4/r5 are scratch), 5 vector regs, stack size 0
> +    movd                  m0, [coefq+0]             ; 32-bit load at coef + 0 bytes; only the low word is used after SPLATW
> +    movd                  m1, [coefq+2]             ; 32-bit load at coef + 2 bytes; only the low word is used after SPLATW

movd m1, [coefq]
SPLATW m0, m1, 0
SPLATW m1, m1, 1

> +    SPLATW                m0, m0                    ; broadcast a word of m0 to all word lanes (presumably word 0, the macro default - confirm in x86util.asm)
> +    SPLATW                m1, m1                    ; same for m1
> +    shr            linesized, 3                      ; iteration count = linesize / 8 (8 input bytes per iteration)
> +    mov                  r4q, 0                      ; r4 = running byte offset into both input lines
> +    mov                  r5q, [in_lines_cur0q + gprsize] ; r5 = second pointer stored in the array passed as in_lines_cur0
> +    mov       in_lines_cur0q, [in_lines_cur0q]       ; replace the array pointer with its first entry
> +    %define   in_lines_cur1q  r5q
> +
> +    .loop                                            ; each iteration turns 8 input bytes per line into 8 dwords of output
> +    movh                            m2, [in_lines_cur0q+r4q] ; load 8 bytes from the first line
> +    movh                            m3, [in_lines_cur1q+r4q] ; load 8 bytes from the second line
> +    pxor                            m4, m4           ; zero register for the byte -> word unpack
> +    punpcklbw                       m2, m4           ; zero-extend first-line bytes to words
> +    punpcklbw                       m3, m4           ; zero-extend second-line bytes to words
> +    SBUTTERFLY                      wd, 2, 3, 4      ; interleave words of m2/m3 using m4 as scratch (presumably m2 = low halves, m3 = high halves - confirm in x86util.asm)
> +    pmaddwd                         m2, m0           ; multiply words by m0 and sum adjacent pairs into dwords
> +    pmaddwd                         m3, m1           ; NOTE(review): if the words are interleaved line0/line1, m2 is weighted only by the m0 coefficient and m3 only by the m1 coefficient - looks correct only when both coefficients are equal; confirm
> +    mova            [work_lineq+r4q*4], m2           ; store 4 dwords; output offset is 4x the input byte offset
> +    mova     [work_lineq+r4q*4+mmsize], m3           ; store the next 4 dwords
> +    add                            r4q, 8            ; advance 8 input bytes
> +    sub                      linesized, 1            ; one iteration done
> +    jg .loop                                         ; repeat while the count is still > 0 (body always runs at least once)
> +REP_RET                                             ; return via the x86inc return macro
> +
> +cglobal w3fdif_simple_high, 5, 10, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize

This is clearly not x86_32 friendly (it asks for 10 GPRs), so you will either have
to get it working using 7 regs, or mark it as x86_64 only.

> +    movd                  m0, [coefq+0]
> +    movd                  m1, [coefq+2]
> +    movd                  m2, [coefq+4]

movq m2, [coefq]
SPLATW m0, m2, 0
SPLATW m1, m2, 1
SPLATW m2, m2, 2

And so on for every function.

> +    SPLATW                m0, m0
> +    SPLATW                m1, m1
> +    SPLATW                m2, m2
> +    SBUTTERFLY            wd, 0, 1, 7
> +    shr            linesized, 3

Seems pointless if the only other instruction using this reg is a sub at the
end of the loop.
Can't you do the neg trick on linesize and use that as part of the effective
addresses inside the loop, instead of a zeroed r5q?

> +    mov                  r5q, 0
> +    mov                  r7q, [in_lines_cur0q+gprsize*2]
> +    mov                  r6q, [in_lines_cur0q+gprsize]
> +    mov       in_lines_cur0q, [in_lines_cur0q]
> +    %define   in_lines_cur1q  r6q
> +    %define   in_lines_cur2q  r7q

Instead of defining their names here, just name them in the cglobal line.
You can name registers there that aren't function arguments just fine.

Both the above suggestions apply to other functions as well.