[FFmpeg-cvslog] aarch64: h264 loop filter NEON optimizations

Wed Jan 15 15:36:16 CET 2014

ffmpeg | branch: master | Janne Grunau <janne-libav at jannau.net> | Fri Dec 20 21:02:43 2013 +0100| [36e3b1f2fd262028834a9d7b1eb533c1218ee6c2] | committer: Janne Grunau

aarch64: h264 loop filter NEON optimizations

Ported from ARMv7 NEON.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=36e3b1f2fd262028834a9d7b1eb533c1218ee6c2
---

 libavcodec/aarch64/Makefile               |    3 +-
 libavcodec/aarch64/h264dsp_init_aarch64.c |   14 ++
 libavcodec/aarch64/h264dsp_neon.S         |  259 +++++++++++++++++++++++++++++
 libavcodec/aarch64/neon.S                 |   24 +++
 4 files changed, 299 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index aa1dd4e..93a8d0c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -6,7 +6,8 @@ OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
 
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
-NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264idct_neon.o
+NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
+                                           aarch64/h264idct_neon.o
 NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
                                            aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index 8148336..307b30c 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -25,6 +25,15 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/h264dsp.h"
 
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+
 void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
@@ -49,6 +58,11 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags) && bit_depth == 8) {
+        c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
+        c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
+        c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
         c->h264_idct_add        = ff_h264_idct_add_neon;
         c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
         c->h264_idct_add16      = ff_h264_idct_add16_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
new file mode 100644
index 0000000..777ddef
--- /dev/null
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav at jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro  h264_loop_filter_start
+        cmp             w2,  #0
+        ldr             w6,  [x4]
+        ccmp            w3,  #0, #0, ne
+        mov             v24.S[0], w6
+        and             w6,  w6,  w6,  lsl #16
+        b.eq            1f
+        ands            w6,  w6,  w6,  lsl #8
+        b.ge            2f
+1:
+        ret
+2:
+.endm
+
+.macro  h264_loop_filter_luma
+        dup             v22.16B, w2                     // alpha
+        uxtl            v24.8H,  v24.8B
+        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
+        uxtl            v24.4S,  v24.4H
+        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
+        sli             v24.8H,  v24.8H,  #8
+        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
+        sli             v24.4S,  v24.4S,  #16
+        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
+        dup             v22.16B, w3                     // beta
+        cmlt            v23.16B, v24.16B, #0
+        cmhi            v28.16B, v22.16B, v28.16B       // < beta
+        cmhi            v30.16B, v22.16B, v30.16B       // < beta
+        bic             v21.16B, v21.16B, v23.16B
+        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
+        and             v21.16B, v21.16B, v28.16B
+        uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
+        cmhi            v17.16B, v22.16B, v17.16B       // < beta
+        and             v21.16B, v21.16B, v30.16B
+        cmhi            v19.16B, v22.16B, v19.16B       // < beta
+        and             v17.16B, v17.16B, v21.16B
+        and             v19.16B, v19.16B, v21.16B
+        and             v24.16B, v24.16B, v21.16B
+        urhadd          v28.16B, v16.16B,  v0.16B
+        sub             v21.16B, v24.16B, v17.16B
+        uqadd           v23.16B, v18.16B, v24.16B
+        uhadd           v20.16B, v20.16B, v28.16B
+        sub             v21.16B, v21.16B, v19.16B
+        uhadd           v28.16B,  v4.16B, v28.16B
+        umin            v23.16B, v23.16B, v20.16B
+        uqsub           v22.16B, v18.16B, v24.16B
+        uqadd           v4.16B,   v2.16B, v24.16B
+        umax            v23.16B, v23.16B, v22.16B
+        uqsub           v22.16B,  v2.16B, v24.16B
+        umin            v28.16B,  v4.16B, v28.16B
+        uxtl            v4.8H,    v0.8B
+        umax            v28.16B, v28.16B, v22.16B
+        uxtl2           v20.8H,   v0.16B
+        usubw           v4.8H,    v4.8H,  v16.8B
+        usubw2          v20.8H,  v20.8H,  v16.16B
+        shl             v4.8H,    v4.8H,  #2
+        shl             v20.8H,  v20.8H,  #2
+        uaddw           v4.8H,    v4.8H,  v18.8B
+        uaddw2          v20.8H,  v20.8H,  v18.16B
+        usubw           v4.8H,    v4.8H,   v2.8B
+        usubw2          v20.8H,  v20.8H,   v2.16B
+        rshrn           v4.8B,    v4.8H,  #3
+        rshrn2          v4.16B,  v20.8H,  #3
+        bsl             v17.16B, v23.16B, v18.16B
+        bsl             v19.16B, v28.16B,  v2.16B
+        neg             v23.16B, v21.16B
+        uxtl            v28.8H,  v16.8B
+        smin            v4.16B,   v4.16B, v21.16B
+        uxtl2           v21.8H,  v16.16B
+        smax            v4.16B,   v4.16B, v23.16B
+        uxtl            v22.8H,   v0.8B
+        uxtl2           v24.8H,   v0.16B
+        saddw           v28.8H,  v28.8H,  v4.8B
+        saddw2          v21.8H,  v21.8H,  v4.16B
+        ssubw           v22.8H,  v22.8H,  v4.8B
+        ssubw2          v24.8H,  v24.8H,  v4.16B
+        sqxtun          v16.8B,  v28.8H
+        sqxtun2         v16.16B, v21.8H
+        sqxtun          v0.8B,   v22.8H
+        sqxtun2         v0.16B,  v24.8H
+.endm
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+        sxtw            x1,  w1
+
+        ld1             {v0.16B},  [x0], x1
+        ld1             {v2.16B},  [x0], x1
+        ld1             {v4.16B},  [x0], x1
+        sub             x0,  x0,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #1
+        ld1             {v20.16B},  [x0], x1
+        ld1             {v18.16B},  [x0], x1
+        ld1             {v16.16B},  [x0], x1
+
+        h264_loop_filter_luma
+
+        sub             x0,  x0,  x1, lsl #1
+        st1             {v17.16B},  [x0], x1
+        st1             {v16.16B}, [x0], x1
+        st1             {v0.16B},  [x0], x1
+        st1             {v19.16B}, [x0]
+
+        ret
+endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  #4
+        ld1             {v6.8B},  [x0], x1
+        ld1             {v20.8B}, [x0], x1
+        ld1             {v18.8B}, [x0], x1
+        ld1             {v16.8B}, [x0], x1
+        ld1             {v0.8B},  [x0], x1
+        ld1             {v2.8B},  [x0], x1
+        ld1             {v4.8B},  [x0], x1
+        ld1             {v26.8B}, [x0], x1
+        ld1             {v6.D}[1],  [x0], x1
+        ld1             {v20.D}[1], [x0], x1
+        ld1             {v18.D}[1], [x0], x1
+        ld1             {v16.D}[1], [x0], x1
+        ld1             {v0.D}[1],  [x0], x1
+        ld1             {v2.D}[1],  [x0], x1
+        ld1             {v4.D}[1],  [x0], x1
+        ld1             {v26.D}[1], [x0], x1
+
+        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+        h264_loop_filter_luma
+
+        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
+
+        sub             x0,  x0,  x1, lsl #4
+        add             x0,  x0,  #2
+        st1             {v17.S}[0],  [x0], x1
+        st1             {v16.S}[0], [x0], x1
+        st1             {v0.S}[0],  [x0], x1
+        st1             {v19.S}[0], [x0], x1
+        st1             {v17.S}[1],  [x0], x1
+        st1             {v16.S}[1], [x0], x1
+        st1             {v0.S}[1],  [x0], x1
+        st1             {v19.S}[1], [x0], x1
+        st1             {v17.S}[2],  [x0], x1
+        st1             {v16.S}[2], [x0], x1
+        st1             {v0.S}[2],  [x0], x1
+        st1             {v19.S}[2], [x0], x1
+        st1             {v17.S}[3],  [x0], x1
+        st1             {v16.S}[3], [x0], x1
+        st1             {v0.S}[3],  [x0], x1
+        st1             {v19.S}[3], [x0], x1
+
+        ret
+endfunc
+
+.macro  h264_loop_filter_chroma
+        dup             v22.8B, w2              // alpha
+        uxtl            v24.8H, v24.8B
+        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
+        uxtl            v4.8H,  v0.8B
+        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
+        usubw           v4.8H,  v4.8H,  v16.8B
+        sli             v24.8H, v24.8H, #8
+        shl             v4.8H,  v4.8H,  #2
+        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
+        uaddw           v4.8H,  v4.8H,  v18.8B
+        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
+        usubw           v4.8H,  v4.8H,  v2.8B
+        dup             v22.8B, w3              // beta
+        rshrn           v4.8B,  v4.8H,  #3
+        cmhi            v28.8B, v22.8B, v28.8B  // < beta
+        cmhi            v30.8B, v22.8B, v30.8B  // < beta
+        smin            v4.8B,  v4.8B,  v24.8B
+        neg             v25.8B, v24.8B
+        and             v26.8B, v26.8B, v28.8B
+        smax            v4.8B,  v4.8B,  v25.8B
+        and             v26.8B, v26.8B, v30.8B
+        uxtl            v22.8H, v0.8B
+        and             v4.8B,  v4.8B,  v26.8B
+        uxtl            v28.8H, v16.8B
+        saddw           v28.8H, v28.8H, v4.8B
+        ssubw           v22.8H, v22.8H, v4.8B
+        sqxtun          v16.8B, v28.8H
+        sqxtun          v0.8B,  v22.8H
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  x1, lsl #1
+        ld1             {v18.8B}, [x0], x1
+        ld1             {v16.8B}, [x0], x1
+        ld1             {v0.8B},  [x0], x1
+        ld1             {v2.8B},  [x0]
+
+        h264_loop_filter_chroma
+
+        sub             x0,  x0,  x1, lsl #1
+        st1             {v16.8B}, [x0], x1
+        st1             {v0.8B},  [x0], x1
+
+        ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  #2
+        ld1             {v18.S}[0], [x0], x1
+        ld1             {v16.S}[0], [x0], x1
+        ld1             {v0.S}[0],  [x0], x1
+        ld1             {v2.S}[0],  [x0], x1
+        ld1             {v18.S}[1], [x0], x1
+        ld1             {v16.S}[1], [x0], x1
+        ld1             {v0.S}[1],  [x0], x1
+        ld1             {v2.S}[1],  [x0], x1
+
+        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
+
+        h264_loop_filter_chroma
+
+        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
+
+        sub             x0,  x0,  x1, lsl #3
+        st1             {v18.S}[0], [x0], x1
+        st1             {v16.S}[0], [x0], x1
+        st1             {v0.S}[0],  [x0], x1
+        st1             {v2.S}[0],  [x0], x1
+        st1             {v18.S}[1], [x0], x1
+        st1             {v16.S}[1], [x0], x1
+        st1             {v0.S}[1],  [x0], x1
+        st1             {v2.S}[1],  [x0], x1
+
+        ret
+endfunc
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index c4310d7..f1072b7 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -80,6 +80,30 @@
         trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
 .endm
 
+.macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().16B, \r0\().16B,  \r1\().16B
+        trn2            \t5\().16B, \r0\().16B,  \r1\().16B
+        trn1            \t6\().16B, \r2\().16B,  \r3\().16B
+        trn2            \t7\().16B, \r2\().16B,  \r3\().16B
+
+        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
+        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
+        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
+        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
+.endm
+
+.macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B
+
+        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
+        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
+        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
+        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
+.endm
+
 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
         trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
         trn2            \r5\().4H,  \r0\().4H,  \r1\().4H