[FFmpeg-cvslog] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped

Rostislav Pehlivanov git at videolan.org
Tue Jul 12 00:45:35 CEST 2016


ffmpeg | branch: master | Rostislav Pehlivanov <rpehlivanov at ob-encoder.com> | Thu Jun 23 18:06:55 2016 +0100| [bd61f3c6bfb83d7691e124a02394ae76737c26f4] | committer: Rostislav Pehlivanov

diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped

Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bd61f3c6bfb83d7691e124a02394ae76737c26f4
---

 libavcodec/x86/diracdsp.asm    |   42 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/diracdsp_init.c |    4 ++++
 2 files changed, 46 insertions(+)

diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index 8e9f0fb..d86b543 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -22,6 +22,8 @@
 
 SECTION_RODATA
 pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200 ; four dwords, added to the 32-bit samples by put_signed_rect_clamped_10
+clip_10bit:                times 8 dw 0x3ff ; eight words, upper clamp bound used by put_signed_rect_clamped_10
 
 cextern pw_3
 cextern pw_16
@@ -300,3 +302,43 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
     jg     .loop_v
 
     RET
+
+%if ARCH_X86_64 == 1
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+; Adds 0x200 to each 32-bit source sample and stores it as a 16-bit word clamped to [0, 0x3ff].
+; Each .loop_w pass converts 8 samples (2*mmsize source bytes -> mmsize destination bytes); strides are byte offsets.
+cglobal put_signed_rect_clamped_10, 6, 9, 4, dst, dst_stride, src, src_stride, w, h, src_row, dst_row, w_row
+    movsxdifnidn dst_strideq, dst_strided ; int args: upper 32 bits of the register are undefined
+    movsxdifnidn src_strideq, src_strided
+    mov      src_rowq, srcq               ; start of the current source row
+    mov      dst_rowq, dstq               ; start of the current destination row
+    mov      w_rowd,   wd                 ; width, reloaded at the start of every row
+    mova     m2, [clip_10bit]
+    mova     m3, [convert_to_unsigned_10bit]
+
+    .loop_h:
+    mov      srcq, src_rowq
+    mov      dstq, dst_rowq
+    mov      wd,   w_rowd
+
+    .loop_w:
+    movu     m0, [srcq+0*mmsize]
+    movu     m1, [srcq+1*mmsize]
+    paddd    m0, m3
+    paddd    m1, m3
+    packusdw m0, m1                       ; unsigned saturation: negative sums become 0
+    pminuw   m0, m2                       ; unsigned min: a signed clamp would treat words >= 0x8000 as negative
+    movu     [dstq], m0
+
+    add      srcq, 2*mmsize
+    add      dstq, 1*mmsize
+    sub      wd, 8
+    jg       .loop_w
+
+    add      src_rowq, src_strideq
+    add      dst_rowq, dst_strideq
+    sub      hd, 1
+    jg       .loop_h
+
+    RET
+%endif
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 26b885d..43aab6a 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -45,6 +45,9 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, i
 void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+#if ARCH_X86_64 /* the asm for this routine sits inside %if ARCH_X86_64 == 1 in diracdsp.asm */
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+#endif
 
 void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
 
@@ -189,5 +192,6 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
 
     if (EXTERNAL_SSE4(mm_flags)) {
         c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
+        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; /* NOTE(review): the prototype is declared only under #if ARCH_X86_64, but this assignment is unguarded -- confirm x86-32 builds */
     }
 }



More information about the ffmpeg-cvslog mailing list