[FFmpeg-cvslog] avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred

Martin Vignali git at videolan.org
Sat Dec 9 16:17:47 EET 2017


ffmpeg | branch: master | Martin Vignali <martin.vignali at gmail.com> | Sat Dec  2 19:09:58 2017 +0100| [4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1] | committer: Martin Vignali

avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1
---

 libavcodec/x86/lossless_videodsp.asm    | 63 +++++++++++++++++++++------------
 libavcodec/x86/lossless_videodsp_init.c |  3 ++
 2 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 663bf6153e..cfa0620fd1 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -114,40 +114,54 @@ MEDIAN_PRED
     add     dstq, wq
     neg     wq
 %%.loop:
+    pshufb  xm0, xm5
 %if %2
     mova    m1, [srcq+wq]
 %else
     movu    m1, [srcq+wq]
 %endif
-    mova    m2, m1
-    psllw   m1, 8
+    psllw   m2, m1, 8
     paddb   m1, m2
-    mova    m2, m1
-    pshufb  m1, m3
+    pshufb  m2, m1, m3
     paddb   m1, m2
-    pshufb  m0, m5
-    mova    m2, m1
-    pshufb  m1, m4
+    pshufb  m2, m1, m4
     paddb   m1, m2
-%if mmsize == 16
-    mova    m2, m1
-    pshufb  m1, m6
+%if mmsize >= 16
+    pshufb  m2, m1, m6
     paddb   m1, m2
 %endif
-    paddb   m0, m1
+    paddb   xm0, xm1
 %if %1
-    mova    [dstq+wq], m0
+    mova    [dstq+wq], xm0
 %else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
+    movq    [dstq+wq], xm0
+    movhps  [dstq+wq+8], xm0
+%endif
+
+%if mmsize == 32
+    vextracti128    xm2, m1, 1 ; get second lane of the ymm
+    pshufb          xm0, xm5   ; set all values to the last value of the first lane
+    paddb           xm0, xm2
+;store val
+%if %1
+    mova    [dstq+wq+16], xm0
+%else;
+    movq    [dstq+wq+16], xm0
+    movhps  [dstq+wq+16+8], xm0
+%endif
 %endif
     add     wq, mmsize
     jl %%.loop
+%if mmsize == 32
+    mov    eax, [dstq -1]
+    and    eax, 0xff
+%else;
     mov     eax, mmsize-1
     sub     eax, wd
     movd    m1, eax
     pshufb  m0, m1
     movd    eax, m0
+%endif
     RET
 %endmacro
 
@@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left
 
 %macro ADD_LEFT_PRED_UNALIGNED 0
 cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
-    mova    m5, [pb_15]
-    mova    m6, [pb_zzzzzzzz77777777]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    pslldq  m0, 15
-    test    srcq, 15
+    mova    xm5, [pb_15]
+    VBROADCASTI128    m6, [pb_zzzzzzzz77777777]
+    VBROADCASTI128    m4, [pb_zzzz3333zzzzbbbb]
+    VBROADCASTI128    m3, [pb_zz11zz55zz99zzdd]
+    movd    xm0, leftm
+    pslldq  xm0, 15
+    test    srcq, mmsize - 1
     jnz .src_unaligned
-    test    dstq, 15
+    test    dstq, mmsize - 1
     jnz .dst_unaligned
     ADD_LEFT_LOOP 1, 1
 .dst_unaligned:
@@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
 INIT_XMM ssse3
 ADD_LEFT_PRED_UNALIGNED
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_LEFT_PRED_UNALIGNED
+%endif
+
 ;------------------------------------------------------------------------------
 ; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
 ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 4f20c1ce92..beae317cc2 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -38,6 +38,8 @@ int  ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t w, int left);
 int  ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t w, int left);
+int  ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t w, int left);
 
 int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
@@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     }
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         c->add_bytes       = ff_add_bytes_avx2;
+        c->add_left_pred   = ff_add_left_pred_unaligned_avx2;
     }
 }



More information about the ffmpeg-cvslog mailing list