[FFmpeg-devel] [PATCH 4/6] cavs: add a sse2 idct implementation.

Tue Apr 4 19:48:16 EEST 2017

This makes going through the function pointer ff_add_pixels_clamped()
unnecessary, since we always know at compile time which implementation is best.
---
 libavcodec/x86/cavsdsp.c    | 15 +++++++++++++-
 libavcodec/x86/cavsidct.asm | 48 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index add4536..a8a198b 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -29,6 +29,7 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/cavsdsp.h"
 #include "libavcodec/idctdsp.h"
+#include "libavcodec/x86/idctdsp.h"
 #include "constants.h"
 #include "fpel.h"
 #include "idctdsp.h"
@@ -43,7 +44,16 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
 {
     LOCAL_ALIGNED(16, int16_t, b2, [64]);
     ff_cavs_idct8_mmx(b2, block);
-    ff_add_pixels_clamped(b2, dst, stride);
+    ff_add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
+
+static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride) /* SSE2 variant: run the IDCT into a scratch buffer, then hand it to the add-pixels routine */
+{
+    LOCAL_ALIGNED(16, int16_t, b2, [64]); /* scratch for the IDCT output: 64 int16_t, alignment argument 16 */
+    ff_cavs_idct8_sse2(b2, block); /* per the prototype above: out = b2, in = block */
+    ff_add_pixels_clamped_sse2(b2, dst, stride); /* NOTE(review): presumably adds b2 onto dst with clamping, as the name suggests -- confirm in x86/idctdsp.h */
 }
 
 #endif /* HAVE_MMX_EXTERNAL */
@@ -446,6 +456,9 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
         c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+
+        c->cavs_idct8_add = cavs_idct8_add_sse2;
+        c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
     }
 #endif
 }
diff --git a/libavcodec/x86/cavsidct.asm b/libavcodec/x86/cavsidct.asm
index 5421196..99b505d 100644
--- a/libavcodec/x86/cavsidct.asm
+++ b/libavcodec/x86/cavsidct.asm
@@ -29,11 +29,16 @@ cextern pw_64
 
 SECTION .text
 
-%macro CAVS_IDCT8_1D 2 ; source, round
+%macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
+%if %3 == 1
     mova            m4, [%1+7*16]       ; m4 = src7
     mova            m5, [%1+1*16]       ; m5 = src1
     mova            m2, [%1+5*16]       ; m2 = src5
     mova            m7, [%1+3*16]       ; m7 = src3
+%else
+    SWAP             1, 7
+    SWAP             4, 6
+%endif
     mova            m0, m4
     mova            m3, m5
     mova            m6, m2
@@ -163,3 +168,44 @@ cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
     jg .loop_2
 
     RET
+
+INIT_XMM sse2 ; SSE2 8x8 CAVS IDCT: 1D pass, >>3, transpose, 1D pass, >>7, store to out
+cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in ; the x86-64 transpose below uses m8 as scratch, so 9 XMM regs must be declared there (xmm8 is callee-saved on Win64)
+    CAVS_IDCT8_1D  inq, [pw_4]          ; first 1D pass over the input block, rounding constant pw_4
+    psraw           m7, 3               ; scale every first-pass row: arithmetic shift right by 3
+    psraw           m6, 3
+    psraw           m5, 3
+    psraw           m4, 3
+    psraw           m3, 3
+    psraw           m2, 3
+    psraw           m1, 3
+    psraw           m0, 3
+%if ARCH_X86_64
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, 8 ; rows are held in register order 7,5,3,1,0,2,4,6; 8 is the temporary
+    mova    [rsp+4*16], m0              ; row 4 goes to the stack scratch
+%else
+    mova    [rsp+0*16], m4              ; no spare XMM register here: this variant is given stack slots instead
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1
+%endif
+    mova    [rsp+0*16], m7              ; rows 0, 2 and 6 go to the stack scratch as well;
+    mova    [rsp+2*16], m3              ; presumably read back from rsp by the second pass (macro body not fully visible here)
+    mova    [rsp+6*16], m4
+    CAVS_IDCT8_1D  rsp, [pw_64], 0      ; second 1D pass, rounding constant pw_64; init_load=0 replaces the src7/1/5/3 loads with register SWAPs
+    psraw           m7, 7               ; scale every second-pass row: arithmetic shift right by 7
+    psraw           m6, 7
+    psraw           m5, 7
+    psraw           m4, 7
+    psraw           m3, 7
+    psraw           m2, 7
+    psraw           m1, 7
+    psraw           m0, 7
+
+    mova   [outq+0*16], m7              ; write the 8 rows to out, same register order as the transpose arguments
+    mova   [outq+1*16], m5
+    mova   [outq+2*16], m3
+    mova   [outq+3*16], m1
+    mova   [outq+4*16], m0
+    mova   [outq+5*16], m2
+    mova   [outq+6*16], m4
+    mova   [outq+7*16], m6
+    RET
-- 
2.8.1