[FFmpeg-devel] [PATCH] dnxhdenc: get_pixels_8x4_sym_10bit_sse2

Timothy Gu timothygu99 at gmail.com
Wed Apr 9 05:42:56 CEST 2014


Before:
3383 decicycles in dnxhd_10bit_get_pixels_8x4_sym, 130910 runs, 162 skips
After:
750 decicycles in ff_get_pixels_8x4_sym_10bit_sse2, 130999 runs, 73 skips

The overall impact on encoding performance is negligible.

Signed-off-by: Timothy Gu <timothygu99 at gmail.com>
---
 libavcodec/x86/dnxhdenc.asm    | 41 +++++++++++++++++++++++++++++------------
 libavcodec/x86/dnxhdenc_init.c |  4 ++++
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index 9dd6d51..d42530b 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -26,18 +26,30 @@ section .text
 
 ; void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
 ;                              ptrdiff_t line_size)
-INIT_XMM sse2
-cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
-    pxor      m4,       m4
-    movq      m0,       [pixelsq]
-    add       pixelsq,  linesizeq
-    movq      m1,       [pixelsq]
-    movq      m2,       [pixelsq+linesizeq]
-    movq      m3,       [pixelsq+linesizeq*2]
-    punpcklbw m0,       m4
-    punpcklbw m1,       m4
-    punpcklbw m2,       m4
-    punpcklbw m3,       m4
+
+%macro GET_PIXELS 1
+%if %1 == 8
+cglobal get_pixels_8x4_sym,       3,3,5, block, pixels, linesize
+%elif %1 == 16
+cglobal get_pixels_8x4_sym_10bit, 3,3,4, block, pixels, linesize
+%endif
+    %if %1 == mmsize/2
+        pxor        m4, m4
+        %define LOAD movh
+    %elif %1 == mmsize && %1 == 16
+        %define LOAD movu
+    %endif
+    LOAD            m0, [pixelsq]
+    add        pixelsq, linesizeq
+    LOAD            m1, [pixelsq]
+    LOAD            m2, [pixelsq+linesizeq]
+    LOAD            m3, [pixelsq+linesizeq*2]
+    %if %1 == mmsize/2
+        punpcklbw   m0, m4
+        punpcklbw   m1, m4
+        punpcklbw   m2, m4
+        punpcklbw   m3, m4
+    %endif
     mova  [blockq    ], m0
     mova  [blockq+16 ], m1
     mova  [blockq+32 ], m2
@@ -47,3 +59,8 @@ cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
     mova  [blockq+96 ], m1
     mova  [blockq+112], m0
     RET
+%endmacro
+
+INIT_XMM sse2
+GET_PIXELS 8
+GET_PIXELS 16
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index 3b02264..c1c4a8b 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -27,6 +27,8 @@
 
 void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size);
+void ff_get_pixels_8x4_sym_10bit_sse2(int16_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size);
 
 av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
 {
@@ -34,6 +36,8 @@ av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
     if (EXTERNAL_SSE2(av_get_cpu_flags())) {
         if (ctx->cid_table->bit_depth == 8)
             ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
+        else if (ctx->cid_table->bit_depth == 10)
+            ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_10bit_sse2;
     }
 #endif /* HAVE_SSE2_EXTERNAL */
 }
-- 
1.8.3.2



More information about the ffmpeg-devel mailing list