[FFmpeg-devel] [PATCH] SSE2 version of vf_idet's filter_line()

Wed Sep 3 21:37:03 CEST 2014

On Wed, Sep 03, 2014 at 07:05:48PM +0200, Pascal Massimino wrote:
[...]
> > > +    punpcklbw m3, m_zero
> > > +    punpckhbw m4, m_zero
> > > +
> > > +    paddsw    m0, m3
> > > +    paddsw    m1, m4
> > > +
> > > +    movq      m3, [bq+indexq*1]
> > > +    movq      m4, m3
> > > +    punpcklbw m3, m_zero
> > > +    punpckhbw m4, m_zero
> > > +
> > > +    paddw     m3, m3
> > > +    paddw     m4, m4
> > > +    psubsw    m0, m3
> > > +    psubsw    m1, m4
> > > +
> >
> > > +    ABS1      m0, m5
> > > +    ABS1      m1, m5
> >
> > ABS2?
> >
> 
> ABS2 requires the two tmp registers to be different (can't use m5 for both).
> 

Aren't m3 and m4 available at that point?

(ABS2 has the benefit of pairing the instructions of the two operations, so
it is faster than doing two separate ABS1.)

> 
> >
> > > +    paddw     m0, m1
> > > +    movq      m1, m0
> > > +    punpcklwd m0, m_zero
> > > +    punpckhwd m1, m_zero
> > > +    paddd     m0, m1
> > > +    paddd     m_sum, m0
> > > +
> > > +    add       indexq, 0x8
> >
> > > +    CMP       widthq, indexq
> >
> > Someone needs to confirm this, but I think you'll need to make width a
> > ptrdiff_t and not an int
> >
> 
> changed to widthd/indexd, that's enough.
> 

Hopefully...

> 
> >
> > Also... stupid question but what's CMP?
> >
> 
> it's equivalent to 'cmp DWORD' here iirc.
> 

I believe you can keep it lowercase. I thought it was a macro but didn't
see anything like this.

> 
> 
[...]
> diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
> new file mode 100644
> index 0000000..19b7f3b
> --- /dev/null
> +++ b/libavfilter/x86/vf_idet.asm
> @@ -0,0 +1,116 @@
> +;; *****************************************************************************
> +;; * x86-optimized functions for idet filter
> +;; *
> +;; * This file is part of FFmpeg.
> +;; *
> +;; * FFmpeg is free software; you can redistribute it and/or modify
> +;; * it under the terms of the GNU General Public License as published by
> +;; * the Free Software Foundation; either version 2 of the License, or
> +;; * (at your option) any later version.
> +;; *
> +;; * FFmpeg is distributed in the hope that it will be useful,
> +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +;; * GNU General Public License for more details.
> +;; *
> +;; * You should have received a copy of the GNU General Public License along
> +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> +;; ******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_TEXT
> +
> +; Implementation that does 8-bytes at a time using single-word operations.
> +%macro IDET_FILTER_LINE 0

> +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index

Shouldn't this be 4,5,8? 4 args, 5 regs (4 args + 1 local var), 8 xmm (m0..m7)

And BTW, you don't seem to use m2, so you can have 4,5,7; to do that you
probably want to move m_zero from m7 to m2, so that the registers in use are
the contiguous range m0..m6 and there is no confusion.

Now, this macro is only used for MMX, so you can probably do 4,5,0 (no XMM
reg). If you do 4,5,0, please move the INIT_MMX into the macro:

    %macro IDET_FILTER_LINE_MMX 1
    INIT_MMX %1
    ...
    %endmacro

    ...

    IDET_FILTER_LINE_MMX mmx
    IDET_FILTER_LINE_MMX mmxext

> +    xor       indexq, indexq
> +%define   m_zero m7
> +%define   m_sum  m6
> +    pxor      m_sum, m_sum
> +    pxor      m_zero, m_zero
> +
> +.loop:
> +    movu      m0, [aq+indexq*1]
> +    punpckhbw m1, m0, m_zero
> +    punpcklbw m0, m_zero
> +
> +    movu      m3, [cq+indexq*1]
> +    punpckhbw m4, m3, m_zero
> +    punpcklbw m3, m_zero
> +
> +    paddsw    m1, m4
> +    paddsw    m0, m3
> +
> +    movu      m3, [bq+indexq*1]
> +    punpckhbw m4, m3, m_zero
> +    punpcklbw m3, m_zero
> +
> +    paddw     m4, m4
> +    paddw     m3, m3
> +    psubsw    m1, m4
> +    psubsw    m0, m3
> +
> +    ABS1      m1, m5
> +    ABS1      m0, m5
> +
> +    paddw     m0, m1
> +    punpckhwd m1, m0, m_zero
> +    punpcklwd m0, m_zero
> +
> +    paddd     m0, m1
> +    paddd     m_sum, m0
> +
> +    add       indexq, 0x8
> +    CMP       widthd, indexd
> +    jg        .loop
> +
> +    mova      m0, m_sum
> +    psrlq     m_sum, 0x20
> +    paddq     m0, m_sum
> +    movd      eax, m0
> +    RET
> +%endmacro
> +

> +%if ARCH_X86_32
> +INIT_MMX mmxext
> +IDET_FILTER_LINE
> +
> +INIT_MMX mmx
> +IDET_FILTER_LINE
> +%endif
> +
> +;; SSE2 8-bit implementation that does 16-bytes at a time:
> +INIT_XMM sse2

> +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index, total

4,6,7, AFAICT: 4 args, 6 regs (4 args + index + total), 7 xmm (m0..m6)

> +    xor       indexq, indexq
> +    pxor      m0, m0
> +    pxor      m1, m1
> +
> +.sse2_loop:
> +    movu      m2, [bq+indexq*1]  ; B
> +    movu      m3, [aq+indexq*1]  ; A
> +    mova      m6, m2
> +    mova      m4, m3
> +    psubusb   m5, m2, m3         ; ba
> +
> +    movu      m3, [cq+indexq*1]  ; C
> +    add       indexq, 0x10
> +    psubusb   m4, m2             ; ab
> +    CMP       indexd, widthd
> +
> +    psubusb   m6, m3             ; bc
> +    psubusb   m3, m2             ; cb
> +
> +    psadbw    m4, m6             ; |ab - bc|
> +    paddq     m0, m4
> +    psadbw    m5, m3             ; |ba - cb|
> +    paddq     m1, m5
> +    jl       .sse2_loop
> +
> +    paddq     m0, m1
> +    movhlps   m1, m0
> +    paddq     m0, m1
> +    movd      eax, m0
> +    RET
[...]

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140903/da197f38/attachment.asc>