[FFmpeg-cvslog] avcodec/x86/dsp: add_int16_mmx / add_int16_sse2

Mon Jan 20 04:10:08 CET 2014

ffmpeg | branch: master | Michael Niedermayer <michaelni at gmx.at> | Mon Jan 20 03:51:21 2014 +0100| [a493f8541de20e76073433f39f66da31f3834bc4] | committer: Michael Niedermayer

avcodec/x86/dsp: add_int16_mmx / add_int16_sse2

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a493f8541de20e76073433f39f66da31f3834bc4
---

 libavcodec/x86/dsputil.asm    |   65 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/dsputil_init.c |    3 ++
 libavcodec/x86/dsputil_x86.h  |    2 ++
 3 files changed, 70 insertions(+)

diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 77069e2..9450cd8 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -465,6 +465,71 @@ cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
 
+
+%macro ADD_INT16_LOOP 1 ; %1 = is_aligned (1: mova, 0: movu); dst[i] = (dst[i] + src[i]) & mask over w 16-bit words
+    movd      m4, maskd              ; mask is a 32-bit unsigned; only its low word survives the splat
+    punpcklwd m4, m4                 ; three self-unpacks replicate word 0 ...
+    punpcklwd m4, m4                 ; ... into every 16-bit lane of m4
+    punpcklwd m4, m4                 ; (already fully splatted at this point for MMX)
+    add     wd, wd                   ; w is a C int: double it to a byte count and zero-extend into wq
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop                 ; byte count already a multiple of one main-loop iteration
+%%.wordloop:                         ; scalar tail: peel words off the END until wq is a multiple of 2*mmsize
+    sub     wq, 2
+    mov     tmpw, [srcq+wq]          ; scratch must not be ax: on x86_32 eax is r0 and holds dst
+    add     tmpw, [dstq+wq]
+    and     tmpw, maskw
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+%%.tomainloop:
+    add     srcq, wq                 ; point both pointers at the end of the vector part ...
+    add     dstq, wq
+    neg     wq                       ; ... and index with a negative offset counting up to 0
+    jz      %%.end                   ; nothing left for the vector loop
+%%.loop:                             ; two vectors (2*mmsize bytes) per iteration
+%if %1
+    mova    m0, [srcq+wq]
+    mova    m1, [dstq+wq]
+    mova    m2, [srcq+wq+mmsize]
+    mova    m3, [dstq+wq+mmsize]
+%else
+    movu    m0, [srcq+wq]
+    movu    m1, [dstq+wq]
+    movu    m2, [srcq+wq+mmsize]
+    movu    m3, [dstq+wq+mmsize]
+%endif
+    paddw   m0, m1                   ; wrapping 16-bit add
+    paddw   m2, m3
+    pand    m0, m4                   ; keep only the bits selected by mask
+    pand    m2, m4
+%if %1
+    mova    [dstq+wq]       , m0
+    mova    [dstq+wq+mmsize], m2
+%else
+    movu    [dstq+wq]       , m0
+    movu    [dstq+wq+mmsize], m2
+%endif
+    add     wq, 2*mmsize
+    jl %%.loop                       ; wq is negative until the end is reached
+%%.end:
+    RET
+%endmacro
+; void ff_add_int16_<opt>(uint16_t *dst, const uint16_t *src, unsigned mask, int w); tmp (r4) is GPR scratch for the scalar tail
+INIT_MMX mmx
+cglobal add_int16, 4,5,5, dst, src, mask, w, tmp
+    ADD_INT16_LOOP 1
+; SSE2: the mova variant is only taken when both src and dst are mmsize-aligned, otherwise the movu variant runs
+INIT_XMM sse2
+cglobal add_int16, 4,5,5, dst, src, mask, w, tmp
+    test srcq, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    ADD_INT16_LOOP 1
+.unaligned:                          ; reached only by the jumps above; the aligned expansion ends in RET
+    ADD_INT16_LOOP 0
+
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index e0b4041..08bd297 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -542,6 +542,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_MMX_INLINE */
 
 #if HAVE_MMX_EXTERNAL
+    c->add_int16 = ff_add_int16_mmx;
     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
 }
@@ -625,6 +626,8 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
     }
     c->bswap_buf = ff_bswap32_buf_sse2;
+
+    c->add_int16 = ff_add_int16_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 356b2c1..e707e55 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -116,6 +116,8 @@ void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 
 void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
 
 void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                         const uint8_t *diff, int w,