[FFmpeg-devel] [PATCH] arm/vp9/WIP: add ff_vp9_{put, avg}{8, 16, 32, 64}.

Sun Mar 30 15:35:13 CEST 2014

---
Honestly, nothing speed-relevant; this is mostly an exercise to get used to
the ASM.

There is at least one thing that bothers me: the sub instructions in
ff_vp9_{put,avg}{32,64}. If you see a way of avoiding them with some syntactic
trick in the loads, I'd be happy to hear about it. If it's possible to do
without these subs, there is probably a way to avoid the r4 copy of r0, and
thus save some stack save overhead.
---
 libavcodec/arm/Makefile          |   2 +
 libavcodec/arm/vp9dsp_init_arm.c |  66 +++++++++++++++
 libavcodec/arm/vp9mc_neon.S      | 179 +++++++++++++++++++++++++++++++++++++++
 libavcodec/vp9dsp.c              |   1 +
 libavcodec/vp9dsp.h              |   1 +
 5 files changed, 249 insertions(+)
 create mode 100644 libavcodec/arm/vp9dsp_init_arm.c
 create mode 100644 libavcodec/arm/vp9mc_neon.S

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 8ca9021..fca4ce2 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -33,6 +33,7 @@ OBJS-$(CONFIG_VC1_DECODER)             += arm/vc1dsp_init_arm.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += arm/vorbisdsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER)             += arm/vp6dsp_init_arm.o
 OBJS-$(CONFIG_VP8_DECODER)             += arm/vp8dsp_init_arm.o
+OBJS-$(CONFIG_VP9_DECODER)             += arm/vp9dsp_init_arm.o
 OBJS-$(CONFIG_RV30_DECODER)            += arm/rv34dsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv34dsp_init_arm.o        \
                                           arm/rv40dsp_init_arm.o
@@ -105,3 +106,4 @@ NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp6dsp_neon.o
 NEON-OBJS-$(CONFIG_VP8_DECODER)        += arm/vp8dsp_init_neon.o        \
                                           arm/vp8dsp_neon.o
+NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9mc_neon.o
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
new file mode 100644
index 0000000..47689f5
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -0,0 +1,66 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/vp9dsp.h"
+
+#include "libavutil/log.h"
+
+#define fpel_func(avg, sz, opt) \
+void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                              const uint8_t *src, ptrdiff_t src_stride, \
+                              int h, int mx, int my) /* expands to a prototype for ff_vp9_<avg><sz>_<opt>(); the ';' comes from each use below */
+fpel_func(put,  4, neon); /* prototype only: no 4-wide routine is defined in vp9mc_neon.S in this patch */
+fpel_func(put,  8, neon);
+fpel_func(put, 16, neon);
+fpel_func(put, 32, neon);
+fpel_func(put, 64, neon);
+fpel_func(avg,  4, neon); /* prototype only: no 4-wide routine is defined in vp9mc_neon.S in this patch */
+fpel_func(avg,  8, neon);
+fpel_func(avg, 16, neon);
+fpel_func(avg, 32, neon);
+fpel_func(avg, 64, neon);
+#undef fpel_func /* the helper is only needed for the declarations above */
+
+av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags(); /* CPU feature flags, tested with have_neon() below */
+
+#define init_fpel(idx1, idx2, sz, type, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt /* one function fills the [0][0] slot of all four filter types; presumably the no-subpel-offset case -- confirm against vp9dsp.h */
+
+    if (have_neon(cpu_flags)) { /* install the NEON routines only when have_neon() is true */
+        //init_fpel(4, 0,  4, put, neon); // disabled: ff_vp9_put4_neon is declared above but not defined in vp9mc_neon.S
+        init_fpel(3, 0,  8, put, neon); /* idx2 == 0 selects the put variants */
+        init_fpel(2, 0, 16, put, neon);
+        init_fpel(1, 0, 32, put, neon);
+        init_fpel(0, 0, 64, put, neon);
+
+        //init_fpel(4, 1,  4, avg, neon); // disabled: ff_vp9_avg4_neon is declared above but not defined in vp9mc_neon.S
+        init_fpel(3, 1,  8, avg, neon); /* idx2 == 1 selects the avg variants */
+        init_fpel(2, 1, 16, avg, neon);
+        init_fpel(1, 1, 32, avg, neon);
+        init_fpel(0, 1, 64, avg, neon);
+    }
+}
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
new file mode 100644
index 0000000..c54cc7f
--- /dev/null
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -0,0 +1,179 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_vp9_put8_neon, export=1
+        ldr             r12, [sp, #0]           /* r12 = first stack word; presumably `h`, the 5th argument of the C prototype */
+1:
+        subs            r12, r12, #4            /* four rows are copied per iteration; sets the flags tested by bgt below */
+        vld1.8          {d0}, [r2], r3          /* load 8 bytes from r2, then r2 += r3 */
+        vld1.8          {d1}, [r2], r3
+        vld1.8          {d2}, [r2], r3
+        vld1.8          {d3}, [r2], r3
+        vst1.8          {d0}, [r0:64], r1       /* store 8 bytes to r0 (:64 alignment qualifier), then r0 += r1 */
+        vst1.8          {d1}, [r0:64], r1
+        vst1.8          {d2}, [r0:64], r1
+        vst1.8          {d3}, [r0:64], r1
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        bx              lr
+endfunc
+
+function ff_vp9_avg8_neon, export=1
+        push            {r4, lr}
+        ldr             r12, [sp, #8]           /* offset 8 skips the two words just pushed; presumably `h` from the C prototype */
+        mov             r4, r0                  /* r4 walks the r0 buffer for reading while r0 is used for writing */
+1:
+        subs            r12, r12, #4            /* four rows per iteration; sets the flags tested by bgt below */
+        vld1.8          {d0}, [r2], r3          /* four 8-byte rows from r2 into d0-d3, r2 += r3 after each */
+        vld1.8          {d1}, [r2], r3
+        vld1.8          {d2}, [r2], r3
+        vld1.8          {d3}, [r2], r3
+        vld1.8          {d4}, [r4:64], r1       /* four 8-byte rows of the current r0-buffer contents into d4-d7 */
+        vld1.8          {d5}, [r4:64], r1
+        vld1.8          {d6}, [r4:64], r1
+        vld1.8          {d7}, [r4:64], r1
+        vrhadd.u8       q0, q0, q2              /* rounding halving add per byte; q0 = d0:d1, q2 = d4:d5 (rows 0-1) */
+        vrhadd.u8       q1, q1, q3              /* q1 = d2:d3, q3 = d6:d7 (rows 2-3) */
+        vst1.8          {d0}, [r0:64], r1       /* write the averaged rows back, r0 += r1 after each */
+        vst1.8          {d1}, [r0:64], r1
+        vst1.8          {d2}, [r0:64], r1
+        vst1.8          {d3}, [r0:64], r1
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        pop             {r4, pc}                /* restore r4 and return by popping the saved lr into pc */
+endfunc
+
+function ff_vp9_put16_neon, export=1
+        ldr             r12, [sp, #0]           /* r12 = first stack word; presumably `h`, the 5th argument of the C prototype */
+1:
+        subs            r12, r12, #4            /* four rows are copied per iteration; sets the flags tested by bgt below */
+        vld1.8          {q0}, [r2], r3          /* load 16 bytes from r2, then r2 += r3 */
+        vld1.8          {q1}, [r2], r3
+        vld1.8          {q2}, [r2], r3
+        vld1.8          {q3}, [r2], r3
+        vst1.8          {q0}, [r0:128], r1      /* store 16 bytes to r0 (:128 alignment qualifier), then r0 += r1 */
+        vst1.8          {q1}, [r0:128], r1
+        vst1.8          {q2}, [r0:128], r1
+        vst1.8          {q3}, [r0:128], r1
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        bx              lr
+endfunc
+
+function ff_vp9_avg16_neon, export=1
+        ldr             r12, [sp, #0]           /* read before the push below, so offset 0 is still the first stack word (presumably `h`) */
+        push            {r4, lr}
+        mov             r4, r0                  /* r4 walks the r0 buffer for reading while r0 is used for writing */
+1:
+        subs            r12, r12, #2            /* two rows per iteration; sets the flags tested by bgt below */
+        vld1.8          {q0}, [r2], r3          /* two 16-byte rows from r2, r2 += r3 after each */
+        vld1.8          {q1}, [r2], r3
+        vld1.8          {q2}, [r4:128], r1      /* two 16-byte rows of the current r0-buffer contents */
+        vld1.8          {q3}, [r4:128], r1
+        vrhadd.u8       q0, q0, q2              /* rounding halving add per byte */
+        vrhadd.u8       q1, q1, q3
+        vst1.8          {q0}, [r0:128], r1      /* write the averaged rows back, r0 += r1 after each */
+        vst1.8          {q1}, [r0:128], r1
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        pop             {r4, pc}                /* restore r4 and return by popping the saved lr into pc */
+endfunc
+
+function ff_vp9_put32_neon, export=1
+        ldr             r12, [sp, #0]           /* r12 = first stack word; presumably `h`, the 5th argument of the C prototype */
+        sub             r3, #16                 /* the first load of each row advances r2 by 16 via '!', so the row step is stride - 16 */
+        sub             r1, #16                 /* same adjustment for the r0 side */
+1:
+        subs            r12, r12, #2            /* two rows per iteration; sets the flags tested by bgt below */
+        vld1.8          {q0}, [r2]!             /* bytes 0-15 of the row, r2 += 16 */
+        vld1.8          {q1}, [r2], r3          /* bytes 16-31, then step r2 to the next row */
+        vld1.8          {q2}, [r2]!
+        vld1.8          {q3}, [r2], r3
+        vst1.8          {q0}, [r0:128]!         /* bytes 0-15 of the row, r0 += 16 */
+        vst1.8          {q1}, [r0:128], r1      /* bytes 16-31, then step r0 to the next row */
+        vst1.8          {q2}, [r0:128]!
+        vst1.8          {q3}, [r0:128], r1
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        bx              lr
+endfunc
+
+function ff_vp9_avg32_neon, export=1
+        ldr             r12, [sp, #0]           /* read before the push below, so offset 0 is still the first stack word (presumably `h`) */
+        push            {r4, lr}
+        mov             r4, r0                  /* r4 walks the r0 buffer for reading while r0 is used for writing */
+        sub             r3, #16                 /* the first load of each row advances r2 by 16 via '!', so the row step is stride - 16 */
+        sub             r1, #16                 /* same adjustment for the r0/r4 side */
+1:
+        subs            r12, r12, #1            /* one row per iteration; sets the flags tested by bgt below */
+        vld1.8          {q0}, [r2]!             /* bytes 0-15 of the row from r2 */
+        vld1.8          {q1}, [r2], r3          /* bytes 16-31, then step r2 to the next row */
+        vld1.8          {q2}, [r4:128]!         /* current r0-buffer contents, bytes 0-15 */
+        vld1.8          {q3}, [r4:128], r1      /* bytes 16-31, then step r4 to the next row */
+        vrhadd.u8       q0, q0, q2              /* rounding halving add per byte */
+        vrhadd.u8       q1, q1, q3
+        vst1.8          {q0}, [r0:128]!         /* write the averaged row back */
+        vst1.8          {q1}, [r0:128], r1
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        pop             {r4, pc}                /* restore r4 and return by popping the saved lr into pc */
+endfunc
+
+function ff_vp9_put64_neon, export=1
+        ldr             r12, [sp, #0]           /* r12 = first stack word; presumably `h`, the 5th argument of the C prototype */
+        sub             r3, #48                 /* three '!' loads advance r2 by 48 per row, so the row step is stride - 48 */
+        sub             r1, #48                 /* same adjustment for the r0 side */
+1:
+        subs            r12, r12, #1            /* one row per iteration; sets the flags tested by bgt below */
+        vld1.8          {q0}, [r2]!             /* bytes 0-47 of the row, r2 += 16 after each load */
+        vld1.8          {q1}, [r2]!
+        vld1.8          {q2}, [r2]!
+        vld1.8          {q3}, [r2], r3          /* bytes 48-63, then step r2 to the next row */
+        vst1.8          {q0}, [r0:128]!         /* bytes 0-47 of the row, r0 += 16 after each store */
+        vst1.8          {q1}, [r0:128]!
+        vst1.8          {q2}, [r0:128]!
+        vst1.8          {q3}, [r0:128], r1      /* bytes 48-63, then step r0 to the next row */
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        bx              lr
+endfunc
+
+function ff_vp9_avg64_neon, export=1
+        ldr             r12, [sp, #0]           /* read before the pushes below, so offset 0 is still the first stack word (presumably `h`) */
+        push            {r4, lr}
+        vpush           {q4-q7}                 /* save q4-q7: the loop overwrites them with the r0-buffer row */
+        mov             r4, r0                  /* r4 walks the r0 buffer for reading while r0 is used for writing */
+        sub             r3, #48                 /* three '!' loads advance r2 by 48 per row, so the row step is stride - 48 */
+        sub             r1, #48                 /* same adjustment for the r0/r4 side */
+1:
+        subs            r12, r12, #1            /* one row per iteration; sets the flags tested by bgt below */
+        vld1.8          {q0}, [r2]!             /* 64 bytes from r2 into q0-q3 */
+        vld1.8          {q1}, [r2]!
+        vld1.8          {q2}, [r2]!
+        vld1.8          {q3}, [r2], r3          /* last 16 bytes, then step r2 to the next row */
+        vld1.8          {q4}, [r4:128]!         /* 64 bytes of the current r0-buffer contents into q4-q7 */
+        vld1.8          {q5}, [r4:128]!
+        vld1.8          {q6}, [r4:128]!
+        vld1.8          {q7}, [r4:128], r1      /* last 16 bytes, then step r4 to the next row */
+        vrhadd.u8       q0, q0, q4              /* rounding halving add per byte */
+        vrhadd.u8       q1, q1, q5
+        vrhadd.u8       q2, q2, q6
+        vrhadd.u8       q3, q3, q7
+        vst1.8          {q0}, [r0:128]!         /* write the averaged row back */
+        vst1.8          {q1}, [r0:128]!
+        vst1.8          {q2}, [r0:128]!
+        vst1.8          {q3}, [r0:128], r1      /* last 16 bytes, then step r0 to the next row */
+        bgt             1b                      /* loop while the remaining row count is > 0 */
+        vpop            {q4-q7}                 /* restore the registers saved by vpush above */
+        pop             {r4, pc}                /* restore r4 and return by popping the saved lr into pc */
+endfunc
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index e2f99f7..2ee749f 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -2061,4 +2061,5 @@ av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
     vp9dsp_mc_init(dsp);
 
     if (ARCH_X86) ff_vp9dsp_init_x86(dsp);
+    if (ARCH_ARM) ff_vp9dsp_init_arm(dsp);
 }
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h
index db0a92e..a335173 100644
--- a/libavcodec/vp9dsp.h
+++ b/libavcodec/vp9dsp.h
@@ -114,5 +114,6 @@ typedef struct VP9DSPContext {
 void ff_vp9dsp_init(VP9DSPContext *dsp);
 
 void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
+void ff_vp9dsp_init_arm(VP9DSPContext *dsp);
 
 #endif /* AVCODEC_VP9DSP_H */
-- 
1.9.1