[FFmpeg-devel] [PATCH 3/6] add and fix xmm version of simple_idct
James Darnley
jdarnley at obe.tv
Sat Jun 3 03:18:06 EEST 2017
---
libavcodec/tests/x86/dct.c | 3 +++
libavcodec/x86/idctdsp_init.c | 1 +
libavcodec/x86/simple_idct.asm | 45 ++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/simple_idct.h | 1 +
4 files changed, 50 insertions(+)
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 34f5b8767b..97116570f4 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -97,6 +97,9 @@ static const struct algo idct_tab_arch[] = {
#endif
#endif
#endif
+#if HAVE_SSE2_EXTERNAL
+ { "SIMPLE-SSE2", ff_simple_idct_sse2, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2 },
+#endif
{ 0 }
};
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index f1c915aa00..82530a5cc4 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -92,6 +92,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
+ c->idct = ff_simple_idct_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
}
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index 3b62a4f9d3..a6eb42464b 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -151,6 +151,10 @@ SECTION .text
psrad m2, %7
packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+%if mmsize == 16
+pshufd m7, m7, 8
+pshufd m2, m2, 8
+%endif
movq [%5], m7
movq m1, [blockq + %3] ; R3 R1 r3 r1
movq m4, [coeffs + 80] ; -C1 C5 -C1 C5
@@ -172,9 +176,15 @@ SECTION .text
psubd m4, m3 ; a3-B3 a3-b3
psrad m6, %7
packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+%if mmsize == 16
+pshufd m2, m2, 8
+%endif
movq [8 + %5], m2
psrad m4, %7
packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+%if mmsize == 16
+pshufd m4, m4, 8
+%endif
movq [16 + %5], m4
jmp %%2
%%1:
@@ -182,6 +192,9 @@ SECTION .text
paddd m0, [d40000]
psrad m0, 13
packssdw m0, m0
+%if mmsize == 16
+pshufd m0, m0, 8
+%endif
movq [%5], m0
movq [8 + %5], m0
movq [16 + %5], m0
@@ -239,6 +252,10 @@ SECTION .text
psrad m2, %7
packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+%if mmsize == 16
+pshufd m7, m7, 8
+pshufd m2, m2, 8
+%endif
movq [%5], m7
movq m1, [blockq + %3] ; R3 R1 r3 r1
movq m4, [coeffs + 80] ; -C1 C5 -C1 C5
@@ -260,9 +277,15 @@ SECTION .text
psubd m4, m3 ; a3-B3 a3-b3
psrad m6, %7
packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+%if mmsize == 16
+pshufd m2, m2, 8
+%endif
movq [8 + %5], m2
psrad m4, %7
packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+%if mmsize == 16
+pshufd m4, m4, 8
+%endif
movq [16 + %5], m4
%endmacro
@@ -614,9 +637,15 @@ SECTION .text
psrad m7, %6
psrad m3, %6
packssdw m4, m7 ; A0 a0
+%if mmsize == 16
+pshufd m4, m4, q0020
+%endif
movq [%5], m4
psrad m0, %6
packssdw m0, m3 ; A1 a1
+%if mmsize == 16
+pshufd m0, m0, q0020
+%endif
movq [16 + %5], m0
movq [96 + %5], m0
movq [112 + %5], m4
@@ -624,9 +653,15 @@ SECTION .text
psrad m6, %6
psrad m2, %6
packssdw m5, m2 ; A2-B2 a2-b2
+%if mmsize == 16
+pshufd m5, m5, q0020
+%endif
movq [32 + %5], m5
psrad m1, %6
packssdw m6, m1 ; A3+B3 a3+b3
+%if mmsize == 16
+pshufd m6, m6, q0020
+%endif
movq [48 + %5], m6
movq [64 + %5], m6
movq [80 + %5], m5
@@ -711,9 +746,15 @@ SECTION .text
movq m7, [coeffs + 32] ; C6 C2 C6 C2
psrad m1, %6
packssdw m4, m1 ; A0 a0
+%if mmsize == 16
+pshufd m4, m4, 8
+%endif
movq [%5], m4
psrad m2, %6
packssdw m0, m2 ; A1 a1
+%if mmsize == 16
+pshufd m0, m0, 8
+%endif
movq [16 + %5], m0
movq [96 + %5], m0
movq [112 + %5], m4
@@ -889,6 +930,10 @@ RET
INIT_XMM sse2
+cglobal simple_idct, 1, 2, 8, 128, block, t0
+ IDCT
+RET
+
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
IDCT
lea lsize3q, [lsizeq*3]
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d17ef6a462..b19e910372 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -26,6 +26,7 @@ void ff_simple_idct_mmx(int16_t *block);
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_sse2(int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
--
2.12.2
More information about the ffmpeg-devel mailing list