[FFmpeg-devel] [PATCH 2/6] avcodec/h264: add avx 8-bit 4:2:0 chroma h deblock/loop filter
James Darnley
jdarnley at obe.tv
Mon Feb 20 17:33:20 EET 2017
~1.14x faster (93 vs. 81 cycles) compared with the mmxext function
---
libavcodec/x86/h264_deblock.asm | 70 +++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/h264dsp_init.c | 3 ++
2 files changed, 73 insertions(+)
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 2e84ca3..0465c9f 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -1059,6 +1059,54 @@ ff_chroma_intra_body_mmxext:
paddb m2, m6
ret
+%macro LOAD_8_ROWS 8  ; load one dword from each of the 8 operands into m0..m7 (operand N goes to m(N-1))
+ movd m0, %1  ; movd transfers 32 bits, so each register receives 4 bytes in its low dword
+ movd m1, %2
+ movd m2, %3
+ movd m3, %4
+ movd m4, %5
+ movd m5, %6
+ movd m6, %7
+ movd m7, %8
+%endmacro
+
+%macro STORE_8_ROWS 8  ; store the low dword of m0..m7 to the 8 operands (m(N-1) goes to operand N)
+ movd %1, m0  ; only the low 4 bytes of each register are written
+ movd %2, m1
+ movd %3, m2
+ movd %4, m3
+ movd %5, m4
+ movd %6, m5
+ movd %7, m6
+ movd %8, m7
+%endmacro
+
+%macro TRANSPOSE_8x4B_XMM 0  ; in: low dword of m0..m7 = 8 rows of 4 bytes; out: low qword of m0..m3 = 4 columns of 8 bytes
+ punpcklbw m0, m1  ; interleave bytes of rows 0 and 1
+ punpcklbw m2, m3  ; interleave bytes of rows 2 and 3
+ punpcklbw m4, m5  ; interleave bytes of rows 4 and 5
+ punpcklbw m6, m7  ; interleave bytes of rows 6 and 7
+ punpcklwd m0, m2  ; m0: dword c now holds column c for rows 0-3
+ punpcklwd m4, m6  ; m4: dword c now holds column c for rows 4-7
+ punpckhdq m2, m0, m4  ; m2 = column 2 (low qword) and column 3 (high qword), rows 0-7; must read m0 before it is overwritten below
+ punpckldq m0, m4  ; m0 = column 0 (low qword) and column 1 (high qword), rows 0-7
+ MOVHL m1, m0  ; MOVHL is defined outside this view; presumably moves the high qword of the source to the low qword of the destination, giving m1 = column 1
+ MOVHL m3, m2  ; likewise m3 = column 3
+%endmacro
+
+%macro TRANSPOSE_4x8B_XMM 0  ; in: low qword of m0..m3 = 4 columns of 8 bytes; out: low dword of m0..m7 = 8 rows of 4 bytes
+ punpcklbw m0, m1  ; interleave bytes of columns 0 and 1 (one byte pair per row)
+ punpcklbw m2, m3  ; interleave bytes of columns 2 and 3
+ punpckhwd m4, m0, m2  ; m4: dword r holds the 4 bytes of row 4+r; must read m0 before it is overwritten below
+ punpcklwd m0, m2  ; m0: dword r holds the 4 bytes of row r (rows 0-3)
+ MOVHL m6, m4  ; MOVHL is defined outside this view; presumably high qword -> low qword, so m6 starts at row 6
+ MOVHL m2, m0  ; likewise m2 starts at row 2
+ pshufd m1, m0, 1  ; imm 1 places source dword 1 in dword 0: m1 low dword = row 1
+ pshufd m3, m2, 1  ; m3 low dword = row 3
+ pshufd m5, m4, 1  ; m5 low dword = row 5
+ pshufd m7, m6, 1  ; m7 low dword = row 7
+%endmacro
+
%macro CHROMA_INTER_BODY_XMM 1
LOAD_MASK alpha_d, beta_d
movd m6, [tc0_q]
@@ -1078,6 +1126,15 @@ ff_chroma_intra_body_mmxext:
sub %1, stride_q
%endmacro
+%macro CHROMA_H_START_XMM 2  ; setup for the horizontal filter; both arguments are registers that get overwritten: arg 1 = pix + 3*stride, arg 2 = 3*stride
+ movsxdifnidn stride_q, stride_d  ; x86inc helper (not defined in this view); presumably sign-extends the 32-bit stride to 64 bits where needed
+ dec alpha_d  ; alpha - 1; NOTE(review): presumably the form LOAD_MASK expects - confirm against its definition
+ dec beta_d  ; beta - 1; same caveat as alpha
+ lea %2, [3*stride_q]  ; second argument = 3 * stride
+ mov %1, pix_q  ; first argument = pix ...
+ add %1, %2  ; ... + 3 * stride, i.e. the address of the fourth row
+%endmacro
+
%macro DEBLOCK_CHROMA_XMM 1
INIT_XMM %1
@@ -1093,6 +1150,19 @@ cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
movq [pix_q], m2
RET
+cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_  ; 8-bit chroma h deblock: load 8 rows of 4 bytes at pix-2, transpose, filter, transpose back, store; 0-16 is the x86inc stack-size argument (assumed to reserve the 16 bytes used at [rsp] below)
+ CHROMA_H_START_XMM r5, r6  ; r5 = pix + 3*stride, r6 = 3*stride; alpha and beta are decremented
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)  ; PASS8ROWS is defined outside this view; presumably expands to the 8 consecutive row addresses starting at pix - 2
+ TRANSPOSE_8x4B_XMM  ; m0..m3 = the four pixel columns at offsets -2, -1, 0, +1 from pix
+ movq [rsp], m0  ; save column -2 on the stack
+ movq [rsp + 8], m3  ; save column +1 on the stack
+ CHROMA_INTER_BODY_XMM 1  ; filter body (only partly visible here); NOTE(review): presumably clobbers m0/m3 and leaves results in m1/m2 - confirm
+ movq m0, [rsp]  ; restore the saved column -2
+ movq m3, [rsp + 8]  ; restore the saved column +1
+ TRANSPOSE_4x8B_XMM  ; back to 8 rows of 4 bytes in the low dwords of m0..m7
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)  ; write all 4 bytes of each row back to the same addresses that were loaded
+RET  ; x86inc epilogue/return macro
+
%endmacro ; DEBLOCK_CHROMA_XMM
DEBLOCK_CHROMA_XMM avx
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 6794aa5..0b15471 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -319,6 +319,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
+ }
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
--
2.8.3
More information about the ffmpeg-devel
mailing list