[FFmpeg-devel] [PATCH] avcodec/vp9: add vp9_idct_idct_4x4_add_ssse3

Clément Bœsch u at pkh.me
Mon Oct 28 20:56:29 CET 2013


---
 libavcodec/x86/vp9dsp.asm    | 141 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/vp9dsp_init.c |   3 +
 2 files changed, 144 insertions(+)

diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
index f81ac72..228a0a6 100644
--- a/libavcodec/x86/vp9dsp.asm
+++ b/libavcodec/x86/vp9dsp.asm
@@ -83,8 +83,149 @@ const filters_ssse3 ; smooth
                     F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
                     F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
 
+pw_11585x2: times 4 dw 23170                 ; 2*11585: pmulhrsw by 2*c yields (x*c + (1<<13)) >> 14
+pw_6270x2:  times 4 dw 12540                 ; 2*6270, pmulhrsw multiplier used by VP9_IDCT2_1D (t2)
+pw_15137x2: times 4 dw 30274                 ; 2*15137, pmulhrsw multiplier used by VP9_IDCT2_1D (t3)
+pw_t2_coef: dw -15137,  6270, -15137,  6270  ; pmaddwd word pairs producing t2 in VP9_IDCT4_1D
+pw_t3_coef: dw   6270, 15137,   6270, 15137  ; pmaddwd word pairs producing t3 in VP9_IDCT4_1D
+pd_round:   times 2 dd 1<<13                 ; rounding bias added before the >>14 in VP9_MULSUB_2W_2X
+pw_2048:    times 4 dw 2048                  ; pmulhrsw by 2048 <=> (x+8)>>4 (final output scaling)
+
 SECTION .text
 
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+; (a*x + b*y + round) >> 14, evaluated for two coefficient sets against the same source pairs
+%macro VP9_MULSUB_2W_2X 6 ; dst1, dst2, src (unchanged), round, coefs1, coefs2
+    movq                m%1, [%5]               ; dst1 = first set of word coefficient pairs
+    movq                m%2, [%6]               ; dst2 = second set of word coefficient pairs
+    pmaddwd             m%1, m%3                ; multiply word pairs with src and add adjacent products -> dwords
+    pmaddwd             m%2, m%3
+    paddd               m%1, m%4                ; add the rounding constant
+    paddd               m%2, m%4
+    psrad               m%1, 14                 ; arithmetic shift back to coefficient scale
+    psrad               m%2, 14
+%endmacro
+
+%macro VP9_IDCT4_1D 0 ; in/out: m0-m3; m7 must hold the dword round constant; m4-m6 are overwritten
+    SUMSUB_BA           w, 2, 0, 4              ; m2=m2+m0, m0=m0-m2 (same convention as the SUMSUB_BA comments below)
+    movq                m4, [pw_11585x2]
+    pmulhrsw            m0, m4                              ; m0=t1
+    pmulhrsw            m2, m4                              ; m2=t0
+    movq                m6, m3                              ; keep a copy of m3 for the low-half interleave
+    punpckhwd           m3, m1                              ; interleave the high words of m3 and m1
+    VP9_MULSUB_2W_2X     4, 5, 3, 7, pw_t2_coef, pw_t3_coef ; m4/m5 = t2/t3 for the high half (dwords)
+    punpcklwd           m6, m1                              ; interleave the low words of m3 (copy) and m1
+    VP9_MULSUB_2W_2X     1, 3, 6, 7, pw_t2_coef, pw_t3_coef ; m1/m3 = t2/t3 for the low half (dwords)
+    packssdw            m1, m4                              ; m1=t2
+    packssdw            m3, m5                              ; m3=t3
+    SUMSUB_BA            w, 3, 2, 4                         ; m3=t3+t0, m2=-t3+t0
+    SUMSUB_BA            w, 1, 0, 4                         ; m1=t2+t1, m0=-t2+t1
+    SWAP                 0, 3                               ; 3102 -> 0132
+    SWAP                 3, 2                               ; 0132 -> 0123
+%endmacro
+
+%macro VP9_IDCT2_1D 0 ; in: m0, m1; expects m5/m6/m7 = pw_11585x2/pw_6270x2/pw_15137x2; out: m0-m3; m4 is passed to SUMSUB_BA as scratch
+    pmulhrsw            m0, m5                              ; m0=t1
+    movq                m2, m0                              ; m2=t0
+    movq                m3, m1                              ; copy m1 so both t2 and t3 can be derived from it
+    pmulhrsw            m1, m6                              ; m1=t2
+    pmulhrsw            m3, m7                              ; m3=t3
+    SUMSUB_BA            w, 3, 2, 4                         ; m3=t3+t0, m2=-t3+t0
+    SUMSUB_BA            w, 1, 0, 4                         ; m1=t2+t1, m0=-t2+t1
+    SWAP                 0, 3                               ; 3102 -> 0132
+    SWAP                 3, 2                               ; 0132 -> 0123
+%endmacro
+
+%macro VP9_STORE_2X 2 ; %1, %2: word values added to the pixels at dstq and dstq+strideq; m4 must be zero; clobbers m6, m7
+    movd                m6, [dstq]              ; load 4 bytes from the first line
+    movd                m7, [dstq+strideq]      ; load 4 bytes from the second line
+    punpcklbw           m6, m4                  ; widen bytes to words (m4 supplies the zero high bytes)
+    punpcklbw           m7, m4
+    paddw               m6, %1                  ; add the word values to the widened pixels
+    paddw               m7, %2
+    packuswb            m6, m4                  ; pack back to bytes with unsigned saturation
+    packuswb            m7, m4
+    movd            [dstq], m6                  ; store 4 bytes to the first line
+    movd    [dstq+strideq], m7                  ; store 4 bytes to the second line
+%endmacro
+
+INIT_MMX ssse3 ; NOTE(review): MMX registers are used below and no emms is issued here; presumably left to the caller, confirm
+cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob ; picks a dc-only, 2x2 or full 4x4 path from eob, adds the result to dst and zeroes the coefficients it read
+
+    cmp eobd, 4 ; 2x2 or smaller
+    jg .idctfull                                ; eob > 4: run the full 4x4 transform
+
+    cmp eobd, 1 ; dc only
+    jne .idct2x2                                ; not dc-only: take the 2x2 path
+    movd                m0, [blockq]            ; load the first 32 bits of block; word 0 is the value used below
+    movq                m5, [pw_11585x2]
+    pmulhrsw            m0, m5                  ; first scaling by 11585 (rounded, >>14)
+    pmulhrsw            m0, m5                  ; second scaling by 11585 (rounded, >>14)
+    pshufw              m0, m0, 0               ; broadcast word 0 to all four words
+    pxor                m4, m4                  ; zero register: block reset and VP9_STORE_2X
+    movq          [blockq], m4                  ; zero the first 8 bytes of block
+    movq                m5, [pw_2048]
+    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    VP9_STORE_2X        m0, m0                  ; lines 0 and 1 (same value on every pixel)
+    lea               dstq, [dstq+2*strideq]    ; advance dst by two lines
+    VP9_STORE_2X        m0, m0                  ; lines 2 and 3
+    RET
+
+.idct2x2: ; only two int16 values are read at each of byte offsets 0 and 8
+    movd                m0, [blockq+0]          ; two int16 coefficients at byte offset 0
+    movd                m1, [blockq+8]          ; two int16 coefficients at byte offset 8
+    movq                m5, [pw_11585x2]        ; m5-m7: multipliers expected by VP9_IDCT2_1D
+    movq                m6, [pw_6270x2]
+    movq                m7, [pw_15137x2]
+
+    VP9_IDCT2_1D                                ; first 1-D pass
+    TRANSPOSE4x4W  0, 1, 2, 3, 4                ; transpose m0-m3 (m4 as temporary) before the second pass
+    VP9_IDCT2_1D                                ; second 1-D pass
+
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    movq       [blockq+ 0], m4                  ; zero 8 bytes at each offset that was loaded from
+    movq       [blockq+ 8], m4
+
+    movq                m5, [pw_2048]
+    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    pmulhrsw            m1, m5
+    VP9_STORE_2X        m0, m1                  ; lines 0 and 1
+    lea               dstq, [dstq+2*strideq]    ; advance dst by two lines
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+    VP9_STORE_2X        m2, m3                  ; lines 2 and 3
+    RET
+
+.idctfull: ; all 32 bytes of block are read
+    movq                m0, [blockq+ 0]         ; four int16 coefficients per register
+    movq                m1, [blockq+ 8]
+    movq                m2, [blockq+16]
+    movq                m3, [blockq+24]
+
+    movq                m7, [pd_round]          ; round constant consumed by VP9_IDCT4_1D (via VP9_MULSUB_2W_2X)
+    VP9_IDCT4_1D                                ; first 1-D pass
+    TRANSPOSE4x4W  0, 1, 2, 3, 4                ; transpose m0-m3 (m4 as temporary) before the second pass
+    VP9_IDCT4_1D                                ; second 1-D pass
+
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    movq       [blockq+ 0], m4                  ; zero all 32 bytes that were loaded
+    movq       [blockq+ 8], m4
+    movq       [blockq+16], m4
+    movq       [blockq+24], m4
+
+    movq                m5, [pw_2048]
+    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    pmulhrsw            m1, m5
+    VP9_STORE_2X        m0, m1                  ; lines 0 and 1
+    lea               dstq, [dstq+2*strideq]    ; advance dst by two lines
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+    VP9_STORE_2X        m2, m3                  ; lines 2 and 3
+    RET
+
 %macro filter_h_fn 1
 %assign %%px mmsize/2
 cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index cf7a1a4..d131598 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -150,6 +150,8 @@ filters_8tap_1d_fn3(avg)
 #undef filters_8tap_1d_fn3
 #undef filter_8tap_1d_fn
 
+void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
@@ -203,6 +205,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         init_subpel3(0, put, ssse3);
         init_subpel3(1, avg, ssse3);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
     }
 
 #undef init_fpel
-- 
1.8.4.1



More information about the ffmpeg-devel mailing list