[FFmpeg-devel] [PATCH 4/4] avcodec/h264: sse2, avx h luma mbaff deblock/loop filter
James Darnley
jdarnley at obe.tv
Mon Feb 13 14:44:17 EET 2017
x86-64 only
Yorkfield:
- sse2: 2.16x (434 vs. 201 cycles)
Skylake:
- sse2: 3.04x (378 vs. 124 cycles)
- avx: 3.29x (378 vs. 115 cycles)
---
libavcodec/x86/h264_deblock.asm | 119 ++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/h264dsp_init.c | 10 ++++
2 files changed, 129 insertions(+)
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 509a0dbe0c..f47a199e8f 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -377,10 +377,129 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
RET
%endmacro
+; TODO: use macro arguments
+%macro TRANSPOSE_8X8B_XMM 8
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+
+ punpckhwd m5, m4, m6
+ punpcklwd m4, m6
+
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+
+ punpckhdq m6, m1, m5
+ punpckldq m1, m5
+
+ MOVHL m4, m0
+ MOVHL m3, m2
+ MOVHL m7, m6
+ MOVHL m5, m1
+ SWAP 1, 4
+%endmacro
+
+%macro TRANSPOSE_8X8B_XMM 0
+ TRANSPOSE_8X8B_XMM 0, 1, 2, 3, 4, 5, 6, 7
+%endmacro
+
+%macro DEBLOCK_H_LUMA_MBAFF 0
+
+cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_
+ movsxd stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ mov r5, pix_q
+ lea r6, [3*stride_q]
+ add r5, r6
+
+ movq m0, [pix_q - 4]
+ movq m1, [pix_q + stride_q - 4]
+ movq m2, [pix_q + 2*stride_q - 4]
+ movq m3, [r5 - 4]
+ movq m4, [r5 + stride_q - 4]
+ movq m5, [r5 + 2*stride_q - 4]
+ movq m6, [r5 + r6 - 4]
+ movq m7, [r5 +4*stride_q - 4]
+
+ TRANSPOSE_8X8B_XMM
+
+ %assign i 0
+ %rep 8
+ movq [rsp + 16*i], m %+ i
+ %assign i i+1
+ %endrep
+
+ ; p2 = m1 [rsp + 16]
+ ; p1 = m2 [rsp + 32]
+ ; p0 = m3 [rsp + 48]
+ ; q0 = m4 [rsp + 64]
+ ; q1 = m5 [rsp + 80]
+ ; q2 = m6 [rsp + 96]
+
+ SWAP 0, 2
+ SWAP 1, 3
+ SWAP 2, 4
+ SWAP 3, 5
+
+ LOAD_MASK alpha_d, beta_d
+ movd m8, [tc0_q]
+ punpcklbw m8, m8
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [rsp + 16] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ psubb m7, m8, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4
+
+ movdqa m4, [rsp + 96] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ mova m3, [rsp + 80]
+ LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6
+
+ DEBLOCK_P0_Q0
+ SWAP 1, 3
+ SWAP 2, 4
+ movq m0, [rsp]
+ movq m1, [rsp + 16]
+ movq m2, [rsp + 32]
+ movq m5, [rsp + 80]
+ movq m6, [rsp + 96]
+ movq m7, [rsp + 112]
+
+ TRANSPOSE_8X8B_XMM
+ movq [pix_q - 4], m0
+ movq [pix_q + stride_q - 4], m1
+ movq [pix_q + 2*stride_q - 4], m2
+ movq [r5 - 4], m3
+ movq [r5 + stride_q - 4], m4
+ movq [r5 + 2*stride_q - 4], m5
+ movq [r5 + r6 - 4], m6
+ movq [r5 +4*stride_q - 4], m7
+
+RET
+
+%endmacro
+
INIT_XMM sse2
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
+
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
%endif
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 7b3d17f971..10f19401ef 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -137,6 +137,9 @@ LF_IFUNC(h, chroma422_intra, depth, avx) \
LF_FUNC(v, chroma, depth, avx) \
LF_IFUNC(v, chroma_intra, depth, avx)
+LF_FUNC(h, luma_mbaff, 8, sse2)
+LF_FUNC(h, luma_mbaff, 8, avx)
+
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
@@ -297,6 +300,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+
+#if ARCH_X86_64
+ c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
+#endif
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
@@ -307,6 +314,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#if ARCH_X86_64
+ c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
+#endif
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
--
2.11.1
More information about the ffmpeg-devel
mailing list