[FFmpeg-devel] [PATCH 2/2] vp9/x86: 4x4 iadst SIMD (ssse3) variants.

Clément Bœsch u at pkh.me
Wed Jan 22 07:46:19 CET 2014


On Mon, Jan 20, 2014 at 05:12:18PM -0500, Ronald S. Bultje wrote:
> Cycle measurements for intra itxfm_4x4_add on ped1080p.webm:
> idct_idct:    66 -> 67 cycles (noise measurement)
> idct_iadst:  199 -> 79 cycles
> iadst_idct:  165 -> 70 cycles
> iadst_iadst: 183 -> 82 cycles
> ---
>  libavcodec/x86/vp9dsp_init.c |  5 +++-
>  libavcodec/x86/vp9itxfm.asm  | 71 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 75 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
> index 9b2cfe1..9c322c1 100644
> --- a/libavcodec/x86/vp9dsp_init.c
> +++ b/libavcodec/x86/vp9dsp_init.c
> @@ -166,7 +166,7 @@ itxfm_func(iadst, idct,  size, opt); \
>  itxfm_func(idct,  iadst, size, opt); \
>  itxfm_func(iadst, iadst, size, opt)
>  
> -itxfm_func(idct, idct,  4, ssse3);
> +itxfm_funcs(4, ssse3);
>  itxfm_funcs(8, ssse3);
>  itxfm_funcs(8, avx);
>  itxfm_funcs(16, ssse3);
> @@ -247,6 +247,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
>          init_subpel3(0, put, ssse3);
>          init_subpel3(1, avg, ssse3);
>          dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
> +        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_ssse3;
> +        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_ssse3;
> +        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
>          if (ARCH_X86_64) {
>              dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
>              dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_ssse3;
> diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
> index be60f2b..fe9f99a 100644
> --- a/libavcodec/x86/vp9itxfm.asm
> +++ b/libavcodec/x86/vp9itxfm.asm
> @@ -58,6 +58,13 @@ VP9_IDCT_COEFFS  8423, 14053
>  VP9_IDCT_COEFFS 13160,  9760
>  VP9_IDCT_COEFFS  2404, 16207
>  
> +pw_5283_13377: times 4 dw 5283, 13377
> +pw_9929_13377: times 4 dw 9929, 13377
> +pw_15212_m13377: times 4 dw 15212, -13377
> +pw_15212_9929: times 4 dw 15212, 9929
> +pw_m5283_m15212: times 4 dw -5283, -15212
> +pw_13377x2: times 8 dw 13377*2
> +
>  pd_8192: times 4 dd 8192
>  pw_2048: times 8 dw 2048
>  pw_1024: times 8 dw 1024
> @@ -239,6 +246,70 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
>      VP9_IDCT4_WRITEOUT
>      RET
>  
> +;-------------------------------------------------------------------------------------------
> +; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
> +;-------------------------------------------------------------------------------------------
> +
> +%macro VP9_IADST4_1D 0
> +    movq2dq           xmm0, m0
> +    movq2dq           xmm1, m1
> +    movq2dq           xmm2, m2
> +    movq2dq           xmm3, m3
> +    paddw               m3, m0
> +    punpcklwd         xmm0, xmm1
> +    punpcklwd         xmm2, xmm3
> +    pmaddwd           xmm1, xmm0, [pw_5283_13377]
> +    pmaddwd           xmm4, xmm0, [pw_9929_13377]
> +    pmaddwd           xmm0, [pw_15212_m13377]
> +    pmaddwd           xmm3, xmm2, [pw_15212_9929]
> +    pmaddwd           xmm2, [pw_m5283_m15212]
> +    psubw               m3, m2
> +    paddd             xmm0, xmm2
> +    paddd             xmm3, [pd_8192]
> +    paddd             xmm2, [pd_8192]
> +    paddd             xmm1, xmm3
> +    paddd             xmm0, xmm3
> +    paddd             xmm4, xmm2
> +    psrad             xmm1, 14
> +    psrad             xmm0, 14
> +    psrad             xmm4, 14
> +    pmulhrsw            m3, [pw_13377x2]        ; out2
> +    packssdw          xmm0, xmm0
> +    packssdw          xmm1, xmm1
> +    packssdw          xmm4, xmm4
> +    movdq2q             m0, xmm0                ; out3
> +    movdq2q             m1, xmm1                ; out0
> +    movdq2q             m2, xmm4                ; out1

> +    SWAP                 0, 1
> +    SWAP                 1, 2
> +    SWAP                 2, 3

Same remark as before: I think these three SWAPs can be collapsed into a single "SWAP 0, 1, 2, 3", or something similar.

> +%endmacro
> +
> +%macro IADST4_FN 5
> +INIT_MMX %5
> +cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob
> +    mova                m0, [blockq+ 0]
> +    mova                m1, [blockq+ 8]
> +    mova                m2, [blockq+16]
> +    mova                m3, [blockq+24]
> +    mova                m6, [pw_11585x2]
> +    mova                m7, [pd_8192]       ; rounding
> +    VP9_%2_1D
> +    TRANSPOSE4x4W  0, 1, 2, 3, 4
> +    VP9_%4_1D
> +    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
> +    mova       [blockq+ 0], m4
> +    mova       [blockq+ 8], m4
> +    mova       [blockq+16], m4
> +    mova       [blockq+24], m4
> +    VP9_IDCT4_WRITEOUT
> +    RET
> +%endmacro
> +
> +IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
> +IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
> +IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
> +
>  %if ARCH_X86_64 ; TODO: 32-bit? (32-bit limited to 8 xmm reg, we use more)
>  
>  ;-------------------------------------------------------------------------------------------

LGTM

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140122/061e6d69/attachment.asc>


More information about the ffmpeg-devel mailing list