[FFmpeg-devel] [PATCH] arm/vp9/WIP: add ff_vp9_{put, avg}{8, 16, 32, 64}.
Clément Bœsch
u at pkh.me
Sun Mar 30 15:35:13 CEST 2014
---
Honestly, nothing speed-relevant; this is mostly an exercise to get used to
the ASM.
There is at least one thing that bothers me: the sub instructions in
ff_vp9_{put,avg}{32,64}. If you see a way of avoiding them with some syntactic
trick in the loads, I'd be happy to hear about it. If it's possible to do
without these subs, there is probably a way to avoid the r4 copy of r0, and
thus avoid some of the stack push/pop overhead.
---
libavcodec/arm/Makefile | 2 +
libavcodec/arm/vp9dsp_init_arm.c | 66 +++++++++++++++
libavcodec/arm/vp9mc_neon.S | 179 +++++++++++++++++++++++++++++++++++++++
libavcodec/vp9dsp.c | 1 +
libavcodec/vp9dsp.h | 1 +
5 files changed, 249 insertions(+)
create mode 100644 libavcodec/arm/vp9dsp_init_arm.c
create mode 100644 libavcodec/arm/vp9mc_neon.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 8ca9021..fca4ce2 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -33,6 +33,7 @@ OBJS-$(CONFIG_VC1_DECODER) += arm/vc1dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
+OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_arm.o
OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \
arm/rv40dsp_init_arm.o
@@ -105,3 +106,4 @@ NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
NEON-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_neon.o \
arm/vp8dsp_neon.o
+NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9mc_neon.o
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
new file mode 100644
index 0000000..47689f5
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -0,0 +1,66 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/vp9dsp.h"
+
+#include "libavutil/log.h"
+
+#define fpel_func(avg, sz, opt) \
+void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+fpel_func(put, 4, neon); /* prototype only: vp9mc_neon.S in this patch defines no ff_vp9_put4_neon */
+fpel_func(put, 8, neon); /* expands to the prototype of ff_vp9_put8_neon(dst, dst_stride, src, src_stride, h, mx, my) */
+fpel_func(put, 16, neon);
+fpel_func(put, 32, neon);
+fpel_func(put, 64, neon);
+fpel_func(avg, 4, neon); /* prototype only: no ff_vp9_avg4_neon is defined in this patch either */
+fpel_func(avg, 8, neon); /* the macro's first parameter is named "avg" but carries either "put" or "avg" */
+fpel_func(avg, 16, neon);
+fpel_func(avg, 32, neon);
+fpel_func(avg, 64, neon);
+#undef fpel_func /* the helper is only needed for the prototypes above */
+
+av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp) /* install the NEON put/avg functions into dsp->mc when NEON is available */
+{
+ int cpu_flags = av_get_cpu_flags();
+ /* init_fpel: point the [idx2][0][0] slot of all four filter types at the same ff_vp9_<type><sz>_<opt> function */
+#define init_fpel(idx1, idx2, sz, type, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##_##opt
+ /* NOTE(review): the trailing [0][0] is presumably the "no sub-pixel offset" case, which is why the filter type is irrelevant - confirm against vp9dsp.h */
+ if (have_neon(cpu_flags)) {
+ //init_fpel(4, 0, 4, put, neon); -- left disabled: this patch provides no ff_vp9_put4_neon
+ init_fpel(3, 0, 8, put, neon); /* idx1 3, 2, 1, 0 <-> widths 8, 16, 32, 64; idx2 0 = put */
+ init_fpel(2, 0, 16, put, neon);
+ init_fpel(1, 0, 32, put, neon);
+ init_fpel(0, 0, 64, put, neon);
+ /* same widths again with idx2 = 1 for the avg variants */
+ //init_fpel(4, 1, 4, avg, neon); -- left disabled: this patch provides no ff_vp9_avg4_neon
+ init_fpel(3, 1, 8, avg, neon);
+ init_fpel(2, 1, 16, avg, neon);
+ init_fpel(1, 1, 32, avg, neon);
+ init_fpel(0, 1, 64, avg, neon);
+ }
+}
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
new file mode 100644
index 0000000..c54cc7f
--- /dev/null
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -0,0 +1,179 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_vp9_put8_neon, export=1 /* copy an 8-byte-wide block from [r2] (step r3) to [r0] (step r1), four rows per pass */
+ ldr r12, [sp, #0] /* r12 = row counter, read from the first stack word (presumably h, the 5th argument of the C prototype) */
+1:
+ subs r12, r12, #4 /* four rows are handled below; the flags set here are consumed by the bgt */
+ vld1.8 {d0}, [r2], r3 /* load 8 bytes, then advance r2 by r3 */
+ vld1.8 {d1}, [r2], r3
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vst1.8 {d0}, [r0:64], r1 /* store 8 bytes (":64" = 64-bit alignment qualifier on r0), then advance r0 by r1 */
+ vst1.8 {d1}, [r0:64], r1
+ vst1.8 {d2}, [r0:64], r1
+ vst1.8 {d3}, [r0:64], r1
+ bgt 1b /* loop while the counter is still positive; NOTE(review): assumes the count is a multiple of 4 - confirm callers */
+ bx lr
+endfunc
+
+function ff_vp9_avg8_neon, export=1 /* rounding-average an 8-byte-wide block at [r2] with the block at [r0], result written to [r0]; four rows per pass */
+ push {r4, lr}
+ ldr r12, [sp, #8] /* the two-register push moved sp down by 8, so the first stack argument is now at sp + 8 */
+ mov r4, r0 /* r4 = read cursor over the destination; r0 remains the write cursor */
+1:
+ subs r12, r12, #4 /* four rows per pass; flags are consumed by the bgt */
+ vld1.8 {d0}, [r2], r3 /* d0-d3 = four rows read through r2 */
+ vld1.8 {d1}, [r2], r3
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vld1.8 {d4}, [r4:64], r1 /* d4-d7 = the four rows currently stored at the destination */
+ vld1.8 {d5}, [r4:64], r1
+ vld1.8 {d6}, [r4:64], r1
+ vld1.8 {d7}, [r4:64], r1
+ vrhadd.u8 q0, q0, q2 /* q0 = d0:d1, q2 = d4:d5 -> rows 0-1, per byte (a + b + 1) >> 1 */
+ vrhadd.u8 q1, q1, q3 /* q1 = d2:d3, q3 = d6:d7 -> rows 2-3 */
+ vst1.8 {d0}, [r0:64], r1 /* write the averaged rows back through r0 */
+ vst1.8 {d1}, [r0:64], r1
+ vst1.8 {d2}, [r0:64], r1
+ vst1.8 {d3}, [r0:64], r1
+ bgt 1b /* NOTE(review): assumes the count is a multiple of 4 - confirm callers */
+ pop {r4, pc} /* restore r4 and return by popping the saved lr into pc */
+endfunc
+
+function ff_vp9_put16_neon, export=1 /* copy a 16-byte-wide block from [r2] (step r3) to [r0] (step r1), four rows per pass */
+ ldr r12, [sp, #0] /* r12 = row counter from the first stack word (presumably h per the C prototype) */
+1:
+ subs r12, r12, #4 /* four rows per pass; flags are consumed by the bgt */
+ vld1.8 {q0}, [r2], r3 /* load one 16-byte row, then advance r2 by r3 */
+ vld1.8 {q1}, [r2], r3
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q3}, [r2], r3
+ vst1.8 {q0}, [r0:128], r1 /* store one row (":128" = 128-bit alignment qualifier on r0), then advance r0 by r1 */
+ vst1.8 {q1}, [r0:128], r1
+ vst1.8 {q2}, [r0:128], r1
+ vst1.8 {q3}, [r0:128], r1
+ bgt 1b /* NOTE(review): assumes the count is a multiple of 4 - confirm callers */
+ bx lr
+endfunc
+
+function ff_vp9_avg16_neon, export=1 /* rounding-average a 16-byte-wide block at [r2] with the block at [r0], result written to [r0]; two rows per pass */
+ ldr r12, [sp, #0] /* counter is read before the push below, so the first stack argument is still at offset 0 */
+ push {r4, lr}
+ mov r4, r0 /* r4 = read cursor over the destination; r0 remains the write cursor */
+1:
+ subs r12, r12, #2 /* two rows per pass; flags are consumed by the bgt */
+ vld1.8 {q0}, [r2], r3 /* q0, q1 = two rows read through r2 */
+ vld1.8 {q1}, [r2], r3
+ vld1.8 {q2}, [r4:128], r1 /* q2, q3 = the two rows currently stored at the destination */
+ vld1.8 {q3}, [r4:128], r1
+ vrhadd.u8 q0, q0, q2 /* per byte (a + b + 1) >> 1 */
+ vrhadd.u8 q1, q1, q3
+ vst1.8 {q0}, [r0:128], r1 /* write the averaged rows back through r0 */
+ vst1.8 {q1}, [r0:128], r1
+ bgt 1b /* NOTE(review): assumes an even row count - confirm callers */
+ pop {r4, pc} /* restore r4 and return by popping the saved lr into pc */
+endfunc
+
+function ff_vp9_put32_neon, export=1 /* copy a 32-byte-wide block from [r2] to [r0], two rows per pass */
+ ldr r12, [sp, #0] /* r12 = row counter from the first stack word (presumably h per the C prototype) */
+ sub r3, #16 /* the first 16 bytes of a row are stepped over with "!" writeback, so the row step only has to add stride - 16 */
+ sub r1, #16 /* same adjustment for the store-side step */
+1:
+ subs r12, r12, #2 /* two rows per pass; flags are consumed by the bgt */
+ vld1.8 {q0}, [r2]! /* row 0, bytes 0-15; r2 += 16 */
+ vld1.8 {q1}, [r2], r3 /* row 0, bytes 16-31; r2 moves on to the next row */
+ vld1.8 {q2}, [r2]! /* row 1, same pattern */
+ vld1.8 {q3}, [r2], r3
+ vst1.8 {q0}, [r0:128]! /* stores mirror the loads, with a 128-bit alignment qualifier on r0 */
+ vst1.8 {q1}, [r0:128], r1
+ vst1.8 {q2}, [r0:128]!
+ vst1.8 {q3}, [r0:128], r1
+ bgt 1b /* NOTE(review): assumes an even row count - confirm callers */
+ bx lr
+endfunc
+
+function ff_vp9_avg32_neon, export=1 /* rounding-average a 32-byte-wide block at [r2] with the block at [r0], result written to [r0]; one row per pass */
+ ldr r12, [sp, #0] /* counter is read before the push below, so the first stack argument is still at offset 0 */
+ push {r4, lr}
+ mov r4, r0 /* r4 = read cursor over the destination; r0 remains the write cursor */
+ sub r3, #16 /* 16 bytes of each row are stepped over with "!" writeback, so the row step only has to add stride - 16 */
+ sub r1, #16 /* same adjustment; r1 is used by both the r4 reads and the r0 writes */
+1:
+ subs r12, r12, #1 /* one row per pass; flags are consumed by the bgt */
+ vld1.8 {q0}, [r2]! /* bytes 0-15 read through r2; r2 += 16 */
+ vld1.8 {q1}, [r2], r3 /* bytes 16-31; r2 moves on to the next row */
+ vld1.8 {q2}, [r4:128]! /* the same 32 bytes as currently stored at the destination */
+ vld1.8 {q3}, [r4:128], r1
+ vrhadd.u8 q0, q0, q2 /* per byte (a + b + 1) >> 1 */
+ vrhadd.u8 q1, q1, q3
+ vst1.8 {q0}, [r0:128]! /* write the averaged row back through r0 */
+ vst1.8 {q1}, [r0:128], r1
+ bgt 1b
+ pop {r4, pc} /* restore r4 and return by popping the saved lr into pc */
+endfunc
+
+function ff_vp9_put64_neon, export=1 /* copy a 64-byte-wide block from [r2] to [r0], one row per pass */
+ ldr r12, [sp, #0] /* r12 = row counter from the first stack word (presumably h per the C prototype) */
+ sub r3, #48 /* three "!" accesses step over 48 bytes of a row, so the row step only has to add stride - 48 */
+ sub r1, #48 /* same adjustment for the store-side step */
+1:
+ subs r12, r12, #1 /* one row per pass; flags are consumed by the bgt */
+ vld1.8 {q0}, [r2]! /* bytes 0-15; r2 += 16 */
+ vld1.8 {q1}, [r2]! /* bytes 16-31 */
+ vld1.8 {q2}, [r2]! /* bytes 32-47 */
+ vld1.8 {q3}, [r2], r3 /* bytes 48-63; r2 moves on to the next row */
+ vst1.8 {q0}, [r0:128]! /* stores mirror the loads, with a 128-bit alignment qualifier on r0 */
+ vst1.8 {q1}, [r0:128]!
+ vst1.8 {q2}, [r0:128]!
+ vst1.8 {q3}, [r0:128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg64_neon, export=1 /* rounding-average a 64-byte-wide block at [r2] with the block at [r0], result written to [r0]; one row per pass */
+ ldr r12, [sp, #0] /* counter is read before any push, so the first stack argument is still at offset 0 */
+ push {r4, lr}
+ vpush {q4-q7} /* q4-q7 are overwritten in the loop; saved here and restored by the vpop before returning (callee-saved under the AAPCS) */
+ mov r4, r0 /* r4 = read cursor over the destination; r0 remains the write cursor */
+ sub r3, #48 /* three "!" accesses step over 48 bytes of a row, so the row step only has to add stride - 48 */
+ sub r1, #48 /* same adjustment; r1 is used by both the r4 reads and the r0 writes */
+1:
+ subs r12, r12, #1 /* one row per pass; flags are consumed by the bgt */
+ vld1.8 {q0}, [r2]! /* q0-q3 = one 64-byte row read through r2 */
+ vld1.8 {q1}, [r2]!
+ vld1.8 {q2}, [r2]!
+ vld1.8 {q3}, [r2], r3 /* last 16 bytes; r2 moves on to the next row */
+ vld1.8 {q4}, [r4:128]! /* q4-q7 = the same row as currently stored at the destination */
+ vld1.8 {q5}, [r4:128]!
+ vld1.8 {q6}, [r4:128]!
+ vld1.8 {q7}, [r4:128], r1
+ vrhadd.u8 q0, q0, q4 /* per byte (a + b + 1) >> 1 */
+ vrhadd.u8 q1, q1, q5
+ vrhadd.u8 q2, q2, q6
+ vrhadd.u8 q3, q3, q7
+ vst1.8 {q0}, [r0:128]! /* write the averaged row back through r0 */
+ vst1.8 {q1}, [r0:128]!
+ vst1.8 {q2}, [r0:128]!
+ vst1.8 {q3}, [r0:128], r1
+ bgt 1b
+ vpop {q4-q7} /* undo the vpush in reverse order of the prologue */
+ pop {r4, pc} /* restore r4 and return by popping the saved lr into pc */
+endfunc
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index e2f99f7..2ee749f 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -2061,4 +2061,5 @@ av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
vp9dsp_mc_init(dsp);
if (ARCH_X86) ff_vp9dsp_init_x86(dsp);
+ if (ARCH_ARM) ff_vp9dsp_init_arm(dsp);
}
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h
index db0a92e..a335173 100644
--- a/libavcodec/vp9dsp.h
+++ b/libavcodec/vp9dsp.h
@@ -114,5 +114,6 @@ typedef struct VP9DSPContext {
void ff_vp9dsp_init(VP9DSPContext *dsp);
void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
+void ff_vp9dsp_init_arm(VP9DSPContext *dsp);
#endif /* AVCODEC_VP9DSP_H */
--
1.9.1
More information about the ffmpeg-devel
mailing list