[FFmpeg-devel] [PATCH 03/14] vp9: add x86 simd (sse2/ssse3) for iadst4 10bpp functions.

Ronald S. Bultje rsbultje at gmail.com
Mon Oct 12 16:28:45 CEST 2015


---
 libavcodec/x86/vp9dsp_init.h                |  6 ++
 libavcodec/x86/vp9dsp_init_16bpp_template.c | 21 ++++++-
 libavcodec/x86/vp9itxfm.asm                 | 58 -----------------
 libavcodec/x86/vp9itxfm_16bpp.asm           | 96 ++++++++++++++++++++++-------
 libavcodec/x86/vp9itxfm_template.asm        | 58 +++++++++++++++++
 5 files changed, 157 insertions(+), 82 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index 5d07b62..b3b0558 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -62,6 +62,12 @@ void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t
                                                                          int16_t *block, \
                                                                          int eob)
 
+#define decl_itxfm_funcs(size, bpp, opt) /* decl_itxfm_func for all four idct/iadst pairings; caller supplies the final ';' */ \
+decl_itxfm_func(idct,  idct,  size, bpp, opt); \
+decl_itxfm_func(iadst, idct,  size, bpp, opt); \
+decl_itxfm_func(idct,  iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
 #define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
 static av_always_inline void \
 ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 6e12af3..93fc684 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -126,8 +126,11 @@ decl_ipred_fns(tm, BPC, mmxext, sse2);
 
 decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
 #if BPC == 10
-decl_itxfm_func(idct, idct, 4, BPC, mmxext);
-decl_itxfm_func(idct, idct, 4, BPC, ssse3);
+decl_itxfm_func(idct,  idct,  4, BPC, mmxext);
+decl_itxfm_func(idct,  iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct,  4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(4, BPC, ssse3);
 #endif
 #endif /* HAVE_YASM */
 
@@ -169,6 +172,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
     init_itx_func(idx, ADST_DCT,  typea, typeb, size, bpp, opt); \
     init_itx_func(idx, DCT_ADST,  typea, typeb, size, bpp, opt); \
     init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) /* init_itx_func for all four idct/iadst pairings; caller supplies the final ';' */ \
+    init_itx_func(idx, DCT_DCT,   idct,  idct,  size, bpp, opt); \
+    init_itx_func(idx, ADST_DCT,  idct,  iadst, size, bpp, opt); \
+    init_itx_func(idx, DCT_ADST,  iadst, idct,  size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt)
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
@@ -185,13 +193,20 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
         init_subpel3(1, avg, BPC, sse2);
         init_lpf_funcs(BPC, sse2);
         init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+        if (!bitexact) {
+            init_itx_func(TX_4X4, ADST_DCT,  idct,  iadst, 4, 10, sse2);
+            init_itx_func(TX_4X4, DCT_ADST,  iadst, idct,  4, 10, sse2);
+            init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+        }
+#endif
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
         init_lpf_funcs(BPC, ssse3);
 #if BPC == 10
         if (!bitexact) {
-            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, ssse3);
+            init_itx_funcs(TX_4X4, 4, BPC, ssse3);
         }
 #endif
     }
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 200f15e..a3e0f86 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -289,64 +289,6 @@ IDCT_4x4_FN ssse3
 ; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 ;-------------------------------------------------------------------------------------------
 
-%macro VP9_IADST4_1D 0
-    movq2dq           xmm0, m0
-    movq2dq           xmm1, m1
-    movq2dq           xmm2, m2
-    movq2dq           xmm3, m3
-%if cpuflag(ssse3)
-    paddw               m3, m0
-%endif
-    punpcklwd         xmm0, xmm1
-    punpcklwd         xmm2, xmm3
-    pmaddwd           xmm1, xmm0, [pw_5283_13377]
-    pmaddwd           xmm4, xmm0, [pw_9929_13377]
-%if notcpuflag(ssse3)
-    pmaddwd           xmm6, xmm0, [pw_13377_0]
-%endif
-    pmaddwd           xmm0, [pw_15212_m13377]
-    pmaddwd           xmm3, xmm2, [pw_15212_9929]
-%if notcpuflag(ssse3)
-    pmaddwd           xmm7, xmm2, [pw_m13377_13377]
-%endif
-    pmaddwd           xmm2, [pw_m5283_m15212]
-%if cpuflag(ssse3)
-    psubw               m3, m2
-%else
-    paddd             xmm6, xmm7
-%endif
-    paddd             xmm0, xmm2
-    paddd             xmm3, xmm5
-    paddd             xmm2, xmm5
-%if notcpuflag(ssse3)
-    paddd             xmm6, xmm5
-%endif
-    paddd             xmm1, xmm3
-    paddd             xmm0, xmm3
-    paddd             xmm4, xmm2
-    psrad             xmm1, 14
-    psrad             xmm0, 14
-    psrad             xmm4, 14
-%if cpuflag(ssse3)
-    pmulhrsw            m3, [pw_13377x2]        ; out2
-%else
-    psrad             xmm6, 14
-%endif
-    packssdw          xmm0, xmm0
-    packssdw          xmm1, xmm1
-    packssdw          xmm4, xmm4
-%if notcpuflag(ssse3)
-    packssdw          xmm6, xmm6
-%endif
-    movdq2q             m0, xmm0                ; out3
-    movdq2q             m1, xmm1                ; out0
-    movdq2q             m2, xmm4                ; out1
-%if notcpuflag(ssse3)
-    movdq2q             m3, xmm6                ; out2
-%endif
-    SWAP                 0, 1, 2, 3
-%endmacro
-
 %macro IADST4_FN 5
 INIT_MMX %5
 cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
index e067438..cf7c5d6 100644
--- a/libavcodec/x86/vp9itxfm_16bpp.asm
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -38,6 +38,15 @@ pw_m15137_6270: times 4 dw -15137, 6270
 pw_6270_15137: times 4 dw 6270, 15137
 pw_11585x2: times 8 dw 11585*2
 
+pw_5283_13377: times 4 dw 5283, 13377
+pw_9929_13377: times 4 dw 9929, 13377
+pw_15212_m13377: times 4 dw 15212, -13377
+pw_15212_9929: times 4 dw 15212, 9929
+pw_m5283_m15212: times 4 dw -5283, -15212
+pw_13377x2: times 8 dw 13377*2
+pw_m13377_13377: times 4 dw -13377, 13377
+pw_13377_0: times 4 dw 13377, 0
+
 SECTION .text
 
 %macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
@@ -129,6 +138,30 @@ IWHT4_FN 10, 1023
 INIT_MMX mmxext
 IWHT4_FN 12, 4095
 
+%macro VP9_IDCT4_WRITEOUT 0 ; round m0-m3 as (x+8)>>4, then hand them to VP9_STORE_2X; expects m4 = min (callers zero it), clobbers m5-m7
+%if cpuflag(ssse3)
+    mova                m5, [pw_2048]           ; presumably words of 2048: pmulhrsw by 2048 == (x + 8) >> 4
+    pmulhrsw            m0, m5
+    pmulhrsw            m1, m5
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+%else
+    mova                m5, [pw_8]              ; presumably words of 8: rounding bias for the >> 4 below
+    paddw               m0, m5
+    paddw               m1, m5
+    paddw               m2, m5
+    paddw               m3, m5
+    psraw               m0, 4
+    psraw               m1, 4
+    psraw               m2, 4
+    psraw               m3, 4
+%endif
+    mova                m5, [pw_1023]           ; VP9_STORE_2X "max" argument (presumably the 10-bit pixel maximum)
+    VP9_STORE_2X         0,  1,  6,  7,  4,  5  ; reg1=m0, reg2=m1, tmp=m6/m7, min=m4, max=m5
+    lea               dstq, [dstq+2*strideq]    ; advance dst by two strides
+    VP9_STORE_2X         2,  3,  6,  7,  4,  5  ; same for m2/m3
+%endmacro
+
 ; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
 ; in 15+1 words without additional effort, since the coefficients are 15bpp.
 
@@ -186,27 +219,7 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
 
     pxor                m4, m4
     ZERO_BLOCK      blockq, 16, 4, m4
-%if cpuflag(ssse3)
-    mova                m5, [pw_2048]
-    pmulhrsw            m0, m5
-    pmulhrsw            m1, m5
-    pmulhrsw            m2, m5
-    pmulhrsw            m3, m5
-%else
-    mova                m5, [pw_8]
-    paddw               m0, m5
-    paddw               m1, m5
-    paddw               m2, m5
-    paddw               m3, m5
-    psraw               m0, 4
-    psraw               m1, 4
-    psraw               m2, 4
-    psraw               m3, 4
-%endif
-    mova                m5, [pw_1023]
-    VP9_STORE_2X         0,  1,  6,  7,  4,  5
-    lea               dstq, [dstq+2*strideq]
-    VP9_STORE_2X         2,  3,  6,  7,  4,  5
+    VP9_IDCT4_WRITEOUT
     RET
 %endmacro
 
@@ -214,3 +227,44 @@ INIT_MMX mmxext
 IDCT4_10_FN
 INIT_MMX ssse3
 IDCT4_10_FN
+
+%macro IADST4_FN 4 ; %1/%3 = type names pasted into the symbol, %2/%4 = 1-D macro stems (VP9_%2_1D runs first, then VP9_%4_1D)
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8                           ; the non-ssse3 VP9_IADST4_1D path uses xmm6/xmm7
+%endif
+    movdqa            xmm5, [pd_8192]           ; rounding term read by VP9_IADST4_1D before its >> 14 (presumably dwords of 8192)
+    mova                m0, [blockq+0*16+0]     ; row 0, bytes 0-7 (rows are 16 bytes apart)
+    mova                m1, [blockq+1*16+0]     ; row 1, bytes 0-7
+    packssdw            m0, [blockq+0*16+8]     ; pack with bytes 8-15: dwords -> four saturated words per row
+    packssdw            m1, [blockq+1*16+8]
+    mova                m2, [blockq+2*16+0]     ; row 2, bytes 0-7
+    mova                m3, [blockq+3*16+0]     ; row 3, bytes 0-7
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]        ; presumably consumed by the ssse3 VP9_IDCT4_1D (not shown here) - TODO confirm
+%endif
+%ifnidn %1%3, iadstiadst
+    movdq2q             m7, xmm5                ; only when one pass is an IDCT: m7 = low qword of the pd_8192 register
+%endif
+    VP9_%2_1D                                   ; first 1-D pass
+    TRANSPOSE4x4W  0, 1, 2, 3, 4                ; transpose m0-m3 between passes (presumably m4 is scratch)
+    VP9_%4_1D                                   ; second 1-D pass
+
+    pxor                m4, m4                  ; m4 = 0: fill value for ZERO_BLOCK and "min" for VP9_STORE_2X
+    ZERO_BLOCK      blockq, 16, 4, m4           ; presumably clears the coefficient block (helper not shown)
+    VP9_IDCT4_WRITEOUT                          ; final rounding and store
+    RET
+%endmacro
+
+INIT_MMX sse2
+IADST4_FN idct,  IDCT4,  iadst, IADST4
+IADST4_FN iadst, IADST4, idct,  IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3
+IADST4_FN idct,  IDCT4,  iadst, IADST4
+IADST4_FN iadst, IADST4, idct,  IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
index f1a05a5..d2f2257 100644
--- a/libavcodec/x86/vp9itxfm_template.asm
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -82,3 +82,61 @@
     VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5     ; m1=t2, m3=t3
     VP9_IDCT4_1D_FINALIZE
 %endmacro
+
+%macro VP9_IADST4_1D 0 ; 1-D 4-point iadst on m0-m3 (in0..in3, words); expects xmm5 = rounding term; clobbers xmm0-4 (plus xmm6/7 without ssse3)
+    movq2dq           xmm0, m0                  ; copy in0..in3 into the low halves of xmm0..xmm3
+    movq2dq           xmm1, m1
+    movq2dq           xmm2, m2
+    movq2dq           xmm3, m3
+%if cpuflag(ssse3)
+    paddw               m3, m0                  ; m3 = in3 + in0 (out2 is built in mmx on this path)
+%endif
+    punpcklwd         xmm0, xmm1                ; interleave words: (in0, in1) pairs
+    punpcklwd         xmm2, xmm3                ; interleave words: (in2, in3) pairs
+    pmaddwd           xmm1, xmm0, [pw_5283_13377]   ; in0*5283  + in1*13377
+    pmaddwd           xmm4, xmm0, [pw_9929_13377]   ; in0*9929  + in1*13377
+%if notcpuflag(ssse3)
+    pmaddwd           xmm6, xmm0, [pw_13377_0]      ; in0*13377
+%endif
+    pmaddwd           xmm0, [pw_15212_m13377]       ; in0*15212 - in1*13377
+    pmaddwd           xmm3, xmm2, [pw_15212_9929]   ; in2*15212 + in3*9929
+%if notcpuflag(ssse3)
+    pmaddwd           xmm7, xmm2, [pw_m13377_13377] ; (in3 - in2)*13377
+%endif
+    pmaddwd           xmm2, [pw_m5283_m15212]       ; -in2*5283 - in3*15212
+%if cpuflag(ssse3)
+    psubw               m3, m2                  ; m3 = in0 - in2 + in3 (m2 still holds in2)
+%else
+    paddd             xmm6, xmm7                ; (in0 - in2 + in3)*13377
+%endif
+    paddd             xmm0, xmm2                ; in0*15212 - in1*13377 - in2*5283 - in3*15212
+    paddd             xmm3, xmm5                ; fold the rounding term into the (in2, in3) partial sums
+    paddd             xmm2, xmm5
+%if notcpuflag(ssse3)
+    paddd             xmm6, xmm5
+%endif
+    paddd             xmm1, xmm3                ; out0 sum: in0*5283  + in1*13377 + in2*15212 + in3*9929
+    paddd             xmm0, xmm3                ; out3 sum: in0*15212 - in1*13377 + in2*9929  - in3*5283
+    paddd             xmm4, xmm2                ; out1 sum: in0*9929  + in1*13377 - in2*5283  - in3*15212
+    psrad             xmm1, 14
+    psrad             xmm0, 14
+    psrad             xmm4, 14
+%if cpuflag(ssse3)
+    pmulhrsw            m3, [pw_13377x2]        ; out2 = ((in0 - in2 + in3)*13377 + 8192) >> 14
+%else
+    psrad             xmm6, 14
+%endif
+    packssdw          xmm0, xmm0                ; dwords -> saturated words
+    packssdw          xmm1, xmm1
+    packssdw          xmm4, xmm4
+%if notcpuflag(ssse3)
+    packssdw          xmm6, xmm6
+%endif
+    movdq2q             m0, xmm0                ; out3
+    movdq2q             m1, xmm1                ; out0
+    movdq2q             m2, xmm4                ; out1
+%if notcpuflag(ssse3)
+    movdq2q             m3, xmm6                ; out2
+%endif
+    SWAP                 0, 1, 2, 3             ; rename registers so m0..m3 = out0..out3 (per x86inc SWAP semantics - confirm)
+%endmacro
-- 
2.1.2



More information about the ffmpeg-devel mailing list