[FFmpeg-cvslog] hevc: cleanups in SSE2 and SSSE3 loop filters, use fewer instructions

Anton Khirnov git at videolan.org
Tue Jul 22 16:26:30 CEST 2014


ffmpeg | branch: master | Anton Khirnov <anton at khirnov.net> | Sat Jul 19 14:18:03 2014 +0200| [b435043abb3653004e5ffa8f66d686f227d07cfe] | committer: Michael Niedermayer

hevc: cleanups in SSE2 and SSSE3 loop filters, use fewer instructions

cherry picked from commit f7843356253459e6010320292dbbc1e888a5249b
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=b435043abb3653004e5ffa8f66d686f227d07cfe
---

 libavcodec/x86/hevc_deblock.asm |   28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 65ec796..1c13655 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -715,10 +715,9 @@ cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
     punpcklbw        m2, m5
     punpcklbw        m3, m5
     CHROMA_DEBLOCK_BODY  8
-    packuswb         m1, m1 ; p0' packed in bytes on low quadword
-    packuswb         m2, m2 ; q0' packed in bytes on low quadword
-    movq [pix0q+strideq], m1
-    movq         [pixq], m2
+    packuswb         m1, m2
+    movh[pix0q+strideq], m1
+    movhps       [pixq], m1
     RET
 
 cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
@@ -793,18 +792,15 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0
     punpcklbw        m7, m8
     LUMA_DEBLOCK_BODY 8, h
 .store:
-    packuswb         m1, m1; p2
-    packuswb         m2, m2; p1
-    packuswb         m3, m3; p0
-    packuswb         m4, m4; q0
-    packuswb         m5, m5; q1
-    packuswb         m6, m6; q2
-    movq        [r5+r1], m1;  p2
-    movq      [r5+2*r1], m2;  p1
-    movq        [r5+r6], m3;  p0
-    movq           [r0], m4;  q0
-    movq        [r0+r1], m5;  q1
-    movq      [r0+2*r1], m6;  q2
+    packuswb          m1, m2
+    packuswb          m3, m4
+    packuswb          m5, m6
+    movh   [r5 +     r1], m1
+    movhps [r5 + 2 * r1], m1
+    movh   [r5 +     r6], m3
+    movhps [r0         ], m3
+    movh   [r0 +     r1], m5
+    movhps [r0 + 2 * r1], m5
 .bypassluma:
     RET
 



More information about the ffmpeg-cvslog mailing list