[FFmpeg-devel] [PATCH] h264pred16x16 plane sse2/ssse3 optimizations
Michael Niedermayer
michaelni
Wed Sep 29 14:51:02 CEST 2010
On Tue, Sep 28, 2010 at 10:31:51PM -0400, Ronald S. Bultje wrote:
> Hi,
>
> this appeared high on my cathedral profiling, so I'm tackling this one
> first. Can/will do 8x8 and 4x4 + mmx/mmx2 functions later also (Jason
> tells me x264 uses 8x8/4x4 plane-mode a lot), but it's quite a bit of
> testing so I thought I'd ask for review of this piece already. Jason
> also tells me there's code in x264 that I should look at but somehow
> it looks completely different/incompatible so I'm not sure if I'm
> looking at the right place/version...
>
> make fate-h264, fate-svq3 and fate-real-rv40 pass with this patch
> (tested h264 both with and without ssse3 enabled).
>
> Numbers (Core i7, x86-64, OSX 10.6.4, cathedral sample):
>
> before: 6719 dezicycles in pred16x16_plane, 262062 runs, 82 skips
> after: 1170 dezicycles in pred16x16_plane, 262128 runs, 16 skips
> (83% speedup)
>
> time before:
> 8.398
> 8.382
> 8.309
> (avg 8.363)
>
> after:
> 8.000
> 8.072
> 8.130
> (avg 8.067, ~3.6% faster)
>
> Didn't profile svq3/rv40, speedup is of course sample-dependent. And
> Diego owes me beer now (5%!).
>
[...]
> +%macro H264_PRED16x16_PLANE_XMM 3
> +cglobal pred16x16_plane_%3_%1, 2, 7, %2
> + mov r2, r1 ; +stride
> + neg r1 ; -stride
> +
> + movh m0, [r0+r1 -1]
> +%ifidn %1, sse2
> + pxor m2, m2
> + movh m1, [r0+r1 +8]
> + punpcklbw m0, m2
> + punpcklbw m1, m2
> + pmullw m0, [pw_m8tom1]
> + pmullw m1, [pw_1to8]
> + paddw m0, m1
> +%else
> + movhps m0, [r0+r1 +8]
> + pmaddubsw m0, [plane_shuf] ; H coefficients
> +%endif
> + movhlps m1, m0
> + paddw m0, m1
> + pshuflw m1, m0, 0xE
> + paddw m0, m1
> + pshuflw m1, m0, 0x1
> + paddw m0, m1 ; sum of H coefficients
> +
> +%ifidn %3, h264
> + pmullw m0, [pw_5]
> + paddw m0, [pw_32]
> + psraw m0, 6
> +%elifidn %3, rv40
> + pmullw m0, [pw_5]
> + psraw m0, 6
> +%elifidn %3, svq3
> + movd r3, m0
> + movsx r3, r3w
> + test r3, r3
> + lea r4, [r3+3]
> + cmovs r3, r4
> + sar r3, 2 ; H/4
> + lea r3, [r3*5] ; 5*(H/4)
> + test r3, r3
> + lea r4, [r3+15]
> + cmovs r3, r4
> + sar r3, 4 ; (5*(H/4))/16
> + movd m0, r3d
> +%endif
> +
> + lea r4, [r0+r2*8-1]
> + lea r3, [r0+r2*4-1]
> + add r4, r2
> +
> +%ifdef ARCH_X86_64
> +%define e_reg r11
> +%else
> +%define e_reg r0
> +%endif
> +
I see a lot of r0-1; maybe r0 could be decreased by 1 somewhere instead?
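Something like this, perhaps (untested sketch; note that on x86-32 r0 doubles as e_reg below and gets clobbered, so the decrement only helps for loads before that, and r3/r4 could then also drop their -1):

```nasm
	dec    r0                       ; fold the constant -1 into the base once
	lea    r4, [r0+r2*8]            ; was: lea r4, [r0+r2*8-1]
	lea    r3, [r0+r2*4]            ; was: lea r3, [r0+r2*4-1]
	...
	movzx  e_reg, byte [r0+r1]      ; was: byte [r0+r1 -1]
```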
> + movzx e_reg, byte [r3+r1 ]
> + movzx r5, byte [r4+r2*2 ]
> + sub r5, e_reg
> + shl r5, 2
> +
> + movzx e_reg, byte [r3 ]
> + movzx r6, byte [r4+r2 ]
> + sub r6, e_reg
> + lea r5, [r5+r6*4]
> + sub r5, r6
> +
> + movzx e_reg, byte [r3+r2 ]
> + movzx r6, byte [r4 ]
> + sub r6, e_reg
> + lea r5, [r5+r6*2]
> +
> + movzx e_reg, byte [r3+r2*2 ]
> + movzx r6, byte [r4+r1 ]
> + sub r6, e_reg
> + add r5, r6
This and the shl-by-2 case look like they could be merged, i.e. add+shl -> lea.
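I.e. something like this (untested): compute the weight-1 difference first, so the shl of the weight-4 term folds into a single lea:

```nasm
	movzx  e_reg, byte [r3+r2*2 ]
	movzx  r5,    byte [r4+r1   ]
	sub    r5,    e_reg             ; r5 = weight-1 diff
	movzx  e_reg, byte [r3+r1   ]
	movzx  r6,    byte [r4+r2*2 ]
	sub    r6,    e_reg             ; r6 = weight-4 diff
	lea    r5,    [r5+r6*4]         ; replaces shl r5,2 + add r5,r6
```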
> +
> + lea r3, [r4+r2*4 ]
> +
> + movzx e_reg, byte [r0+r1 -1]
> + movzx r6, byte [r3+r2*2 ]
> + sub r6, e_reg
> + lea r5, [r5+r6*8]
> +
> + movzx e_reg, byte [r0 -1]
> + movzx r6, byte [r3+r2 ]
> + sub r6, e_reg
> + lea r5, [r5+r6*8]
> + sub r5, r6
The *7 case (lea + sub) could maybe be changed into an add into the *8 case plus a
subtract, replacing a lea by an add.
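That is, using 8*a + 7*b = 8*(a+b) - b, something like (untested):

```nasm
	movzx  e_reg, byte [r0 -1]
	movzx  r6,    byte [r3+r2 ]
	sub    r6,    e_reg             ; r6 = b (weight-7 diff)
	sub    r5,    r6                ; accumulate the -b part now
	movzx  e_reg, byte [r3+r2*2 ]
	add    r6,    e_reg
	movzx  e_reg, byte [r0+r1 -1]
	sub    r6,    e_reg             ; r6 = a + b (a = weight-8 diff)
	lea    r5,    [r5+r6*8]         ; r5 += 8a + 8b, net effect 8a + 7b
```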
> +
> + movzx e_reg, byte [r0+r2 -1]
> + movzx r6, byte [r3 ]
> + sub r6, e_reg
> + lea r5, [r5+r6*4]
> + lea r5, [r5+r6*2]
This could add into the *4 and *2 cases, replacing the 2 leas by 2 adds; or lea
the *2 into a *3 case, reducing the 2 dependent leas on r5 to 1.
Similar tricks may be possible elsewhere.
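For the second variant, e.g. (untested) scale the diff by 3 off the accumulator's dependency chain, then accumulate with one lea:

```nasm
	movzx  e_reg, byte [r0+r2 -1]
	movzx  r6,    byte [r3      ]
	sub    r6,    e_reg
	lea    r6,    [r6+r6*2]         ; r6 = 3*diff, independent of r5
	lea    r5,    [r5+r6*2]         ; r5 += 6*diff, only one lea on the r5 chain
```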
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> ... defining _GNU_SOURCE...
For the love of all that is holy, and some that is not, don't do that.
-- Luca & Mans