[FFmpeg-devel] [PATCH 10/11] avcodec/x86: add an 8-bit simple IDCT function based on the x86-64 high depth functions

Mon Jun 19 18:11:03 EEST 2017

Includes add/put functions

Rounding contributed by Ronald S. Bultje
---
 libavcodec/tests/x86/dct.c                |  2 +
 libavcodec/x86/idctdsp_init.c             | 23 ++++++++
 libavcodec/x86/simple_idct.h              |  9 +++
 libavcodec/x86/simple_idct10.asm          | 92 +++++++++++++++++++++++++++++++
 libavcodec/x86/simple_idct10_template.asm |  6 +-
 5 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 34f5b8767b..317d973f9f 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,10 +88,12 @@ static const struct algo idct_tab_arch[] = {
 #if HAVE_YASM
 #if ARCH_X86_64
 #if HAVE_SSE2_EXTERNAL
+    { "SIMPLE8-SSE2",   ff_simple_idct8_sse2,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
     { "SIMPLE10-SSE2",  ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
     { "SIMPLE12-SSE2",  ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
 #endif
 #if HAVE_AVX_EXTERNAL
+    { "SIMPLE8-AVX",    ff_simple_idct8_avx,   FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
     { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
     { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX,  1 },
 #endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index f1c915aa00..9da60d1a1e 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -94,9 +94,32 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 c->idct_add  = ff_simple_idct_add_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
+
+        if (ARCH_X86_64 &&
+            !high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct8_sse2;
+                c->idct_put  = ff_simple_idct8_put_sse2;
+                c->idct_add  = ff_simple_idct8_add_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
     }
 
     if (ARCH_X86_64 && avctx->lowres == 0) {
+        if (EXTERNAL_AVX(cpu_flags) &&
+            !high_bit_depth &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct8_avx;
+                c->idct_put  = ff_simple_idct8_put_avx;
+                c->idct_add  = ff_simple_idct8_add_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
+
         if (avctx->bits_per_raw_sample == 10 &&
             (avctx->idct_algo == FF_IDCT_AUTO ||
              avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d17ef6a462..9b64cfe9bc 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -29,6 +29,15 @@ void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
+void ff_simple_idct8_sse2(int16_t *block);
+void ff_simple_idct8_avx(int16_t *block);
+
+void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index b492303a57..069bb61378 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -31,11 +31,14 @@ SECTION_RODATA
 
 cextern pw_2
 cextern pw_16
+cextern pw_32
 cextern pw_1023
 cextern pw_4095
+pd_round_11: times 4 dd 1<<(11-1)
 pd_round_12: times 4 dd 1<<(12-1)
 pd_round_15: times 4 dd 1<<(15-1)
 pd_round_19: times 4 dd 1<<(19-1)
+pd_round_20: times 4 dd 1<<(20-1)
 
 %macro CONST_DEC  3
 const %1
@@ -77,8 +80,97 @@ CONST_DEC  w3_min_w7_lo,    W3sh2_lo, -W7sh2
 
 SECTION .text
 
+%macro STORE_HI_LO 12 ; %1-%8: destination operands, %9-%12: source registers
+    movq   %1, %9   ; low 64 bits of each source go to destinations %1, %3, %5, %7
+    movq   %3, %10
+    movq   %5, %11
+    movq   %7, %12
+    movhps %2, %9   ; high 64 bits of each source go to destinations %2, %4, %6, %8
+    movhps %4, %10
+    movhps %6, %11
+    movhps %8, %12
+%endmacro
+
+%macro LOAD_ZXBW_8 16 ; %1-%8: destination registers, %9-%16: source operands
+    pmovzxbw %1, %9  ; zero-extend 8 unsigned bytes to 8 words (SSE4.1 instruction)
+    pmovzxbw %2, %10
+    pmovzxbw %3, %11
+    pmovzxbw %4, %12
+    pmovzxbw %5, %13
+    pmovzxbw %6, %14
+    pmovzxbw %7, %15
+    pmovzxbw %8, %16
+%endmacro
+
+%macro LOAD_ZXBW_4 9 ; %1-%4: destination registers, %5-%8: source operands, %9: zero register
+    movh %1, %5 ; movh comes from outside this file; presumably loads the low half - confirm
+    movh %2, %6
+    movh %3, %7
+    movh %4, %8
+    punpcklbw %1, %9 ; interleave low bytes with %9; this is a zero-extension only if %9 == 0
+    punpcklbw %2, %9
+    punpcklbw %3, %9
+    punpcklbw %4, %9
+%endmacro
+
+%define PASS4ROWS(base, stride, stride3) \
+    [base], [base + stride], [base + 2*stride], [base + stride3]
+
 %macro idct_fn 0
 
+define_constants _lo
+
+cglobal simple_idct8, 1, 1, 16, 32, block
+    IDCT_FN    "", 11, pw_32, 20, "store" ; NOTE(review): "store" presumably writes the result back to block - confirm in template
+RET
+
+cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN    "", 11, pw_32, 20 ; NOTE(review): rows assumed left in m8, m0, m1, m2, m4, m11, m9, m10 - inferred from the packing below
+    lea       r3, [3*lsizeq] ; r3 = 3 * lsize
+    lea       r2, [pixelsq + r3] ; r2 (third argument, block) is overwritten: r2 = pixels + 3 * lsize
+    packuswb  m8, m0 ; pack two registers of words into one of bytes, with unsigned saturation
+    packuswb  m1, m2
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9 ; low/high halves of each packed register go to consecutive PASS8ROWS operands
+RET
+
+cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN    "", 11, pw_32, 20 ; NOTE(review): rows assumed left in m8, m0, m1, m2, m4, m11, m9, m10 - inferred from the adds below
+    lea r2, [3*lsizeq] ; r2 (third argument, block) is overwritten: r2 = 3 * lsize
+    %if cpuflag(sse4)
+        lea r3, [pixelsq + r2] ; r3 = pixels + 3 * lsize
+        LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
+        paddsw m8, m3 ; add the widened destination pixels to the IDCT output (signed saturating words)
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        paddsw m4, m12
+        paddsw m11, m13
+        paddsw m9, m14
+        paddsw m10, m15
+    %else
+        pxor m12, m12 ; m12 = 0, the zero register LOAD_ZXBW_4 interleaves with
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
+        paddsw m8, m3 ; first four rows: add widened pixels (signed saturating words)
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        lea r3, [pixelsq + 4*lsizeq] ; r3 = address of the fifth row, used as base for the second group
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12
+        paddsw m4, m3 ; last four rows
+        paddsw m11, m5
+        paddsw m9, m6
+        paddsw m10, m7
+        lea r3, [pixelsq + r2] ; r3 = pixels + 3 * lsize, the same value the SSE4 branch leaves for the store
+    %endif
+    packuswb  m8, m0 ; pack two registers of words into one of bytes, with unsigned saturation
+    packuswb  m1, m2
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9 ; low/high halves of each packed register go to consecutive PASS8ROWS operands
+RET
+
 define_constants _hi
 
 cglobal simple_idct10, 1, 1, 16, block
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
index 51baf84c82..02fd445ec0 100644
--- a/libavcodec/x86/simple_idct10_template.asm
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -258,6 +258,10 @@
 
     IDCT_1D     %1, %2, %8
 %elif %2 == 11
+    ; This replicates the C code's DC-only shortcut.  When only the DC
+    ; coefficient is present, the C code shifts it and splats the result to all
+    ; coeffs rather than multiplying and doing the full IDCT.  On 8-bit the two
+    ; differ: the multiplier is 16383, not 16384 (which is what a shift gives).
     por     m1, m8, m13
     por     m1, m12
     por     m1, [blockq+ 16]       ; { row[1] }[0-7]
@@ -293,8 +297,6 @@
     por  m9, m6
     pand m10, m5
     por  m10, m6
-    pand m3, m5
-    por  m3, m6
 %else
     IDCT_1D     %1, %2
 %endif
-- 
2.13.1