[FFmpeg-devel] [PATCH 2/3] VP8: Implement bicubic MC functions (put_vp8_epel*)
Rob Clark
Thu Sep 16 20:28:32 CEST 2010
added:
+ put_vp8_epel16_h6v6_neon
+ put_vp8_epel16_h6_neon
+ put_vp8_epel16_v6_neon
+ put_vp8_epel8_v6_neon
+ put_vp8_epel8_h6_neon
+ put_vp8_epel8_h6v6_neon
+ put_vp8_epel8_v4_neon
+ put_vp8_epel8_h4_neon
+ put_vp8_epel8_h4v4_neon
+ put_vp8_epel8_h6v4_neon
+ put_vp8_epel8_h4v6_neon
+ put_vp8_pixels16_neon
+ put_vp8_pixels8_neon
+ put_vp8_pixels4_neon
---
libavcodec/arm/vp8dsp_init_arm.c | 76 ++++
libavcodec/arm/vp8dsp_neon.S | 725 ++++++++++++++++++++++++++++++++++++++
2 files changed, 801 insertions(+), 0 deletions(-)
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index ab4600d..a51aad7 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -44,6 +44,61 @@ void vp8_v_loop_filter_simple_neon(uint8_t *dst, int stride, int flim);
void vp8_h_loop_filter_simple_neon(uint8_t *dst, int stride, int flim);
+/*
+ * put_vp8_*_pixels_tab functions:
+ */
+void put_vp8_pixels16_neon(uint8_t *dst, int dststride, uint8_t *src,
+ int srcstride, int h, int x, int y);
+void put_vp8_pixels8_neon(uint8_t *dst, int dststride, uint8_t *src,
+ int srcstride, int h, int x, int y);
+void put_vp8_pixels4_neon(uint8_t *dst, int dststride, uint8_t *src,
+ int srcstride, int h, int x, int y);
+void put_vp8_epel16_v6_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel16_h6_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel16_h6v6_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_v6_neon (uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_h6_neon (uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_h6v6_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_v4_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_h4_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_h4v4_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_h6v4_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+void put_vp8_epel8_h4v6_neon(uint8_t * dst, int dststride, uint8_t * src,
+ int srcstride, int h, int mx, int my);
+
+#define VP8_MC_FUNC(IDX, SIZE) \
+ dsp->put_vp8_epel_pixels_tab[IDX][0][0] = put_vp8_pixels ## SIZE ## _neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][0][1] = put_vp8_epel ## SIZE ## _h4_neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][0][2] = put_vp8_epel ## SIZE ## _h6_neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][1][0] = put_vp8_epel ## SIZE ## _v4_neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][1][1] = put_vp8_epel ## SIZE ## _h4v4_neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][1][2] = put_vp8_epel ## SIZE ## _h6v4_neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][2][0] = put_vp8_epel ## SIZE ## _v6_neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][2][1] = put_vp8_epel ## SIZE ## _h4v6_neon; \
+// dsp->put_vp8_epel_pixels_tab[IDX][2][2] = put_vp8_epel ## SIZE ## _h6v6_neon
+
+#define VP8_BILINEAR_MC_FUNC(IDX, SIZE) \
+ dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] = put_vp8_pixels ## SIZE ## _neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][0][1] = put_vp8_bilinear ## SIZE ## _h_neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][0][2] = put_vp8_bilinear ## SIZE ## _h_neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][1][0] = put_vp8_bilinear ## SIZE ## _v_neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][1][1] = put_vp8_bilinear ## SIZE ## _hv_neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][1][2] = put_vp8_bilinear ## SIZE ## _hv_neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][2][0] = put_vp8_bilinear ## SIZE ## _v_neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_neon; \
+// dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_neon
+
+
av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
{
if (HAVE_NEON) {
@@ -61,5 +116,26 @@ av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
dsp->vp8_v_loop_filter_simple = vp8_v_loop_filter_simple_neon;
dsp->vp8_h_loop_filter_simple = vp8_h_loop_filter_simple_neon;
#endif
+
+ VP8_MC_FUNC(0, 16);
+ VP8_MC_FUNC(1, 8);
+ VP8_MC_FUNC(2, 4);
+
+ VP8_BILINEAR_MC_FUNC(0, 16);
+ VP8_BILINEAR_MC_FUNC(1, 8);
+ VP8_BILINEAR_MC_FUNC(2, 4);
+
+ // XXX: only the functions implemented so far are hooked up here
+ dsp->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_neon;
}
}
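
(As a reading aid for the pixels_tab assignments above; this indexing is
inferred from the assignments themselves rather than defined by the patch:)

    /* put_vp8_epel_pixels_tab[width][v][h]
     *   width: 0 = 16-wide, 1 = 8-wide, 2 = 4-wide blocks
     *   v, h:  0 = no filtering (copy), 1 = 4-tap filter, 2 = 6-tap filter
     * e.g. [1][2][1] is the 8-wide, 4-tap-horizontal, 6-tap-vertical case,
     * which is why it is assigned put_vp8_epel8_h4v6_neon above. */
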
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index f1d5de2..d741bbd 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -473,3 +473,728 @@ endfunc
@{
@}
+/*
+ * NOTE: for the put*_pixels_tab functions, h will be 16, 8, or 4.  For now,
+ * do a loop of 4 rows at a time, but it is worth checking whether just
+ * adding (16 - h) * 8 to the PC is faster (NEON instructions are 4 bytes,
+ * with one vld and one vst per row).
+ */
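
For reference, the operation these put*_pixels functions implement is a plain
block copy; a minimal scalar sketch (illustrative only, not part of the patch)
would be:

    #include <stdint.h>
    #include <string.h>

    /* Copy an h-row block of 'width' bytes (16, 8 or 4).  The x/y (mx/my)
     * arguments of the real functions are unused in the full-pel case. */
    static void put_vp8_pixels_c(uint8_t *dst, int dststride,
                                 uint8_t *src, int srcstride,
                                 int h, int width)
    {
        while (h--) {
            memcpy(dst, src, width);
            dst += dststride;
            src += srcstride;
        }
    }

The NEON versions below simply unroll this four rows per iteration.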
+
+/*
+ * NOTE: this is a bit lame; the vp8_mc_func prototype doesn't match the
+ * similar functions in dsputil, which look like:
+ *
+ * func(uint8_t *dst, const uint8_t *src, int stride, int h, ...)
+ *
+ * Once the function prototypes are aligned, the following could be removed
+ * and replaced with the common functions from dsputil_neon.S:
+ *
+ * + put_vp8_pixels16_neon
+ * + put_vp8_pixels8_neon
+ * + put_vp8_pixels4_neon (which really just uses ARM core instructions,
+ *   not NEON)
+ */
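
To make the mismatch concrete, the two calling conventions look like this (the
dsputil-style signature is the one quoted in the note; the names here are only
illustrative):

    /* vp8_mc_func style, as used by the functions in this patch: */
    void put_vp8_pixels16_neon(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int h, int x, int y);

    /* dsputil style: a single shared stride and no subpel arguments */
    void put_pixels16(uint8_t *dst, const uint8_t *src, int stride, int h);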
+
+@void put_vp8_pixels16_neon(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int x, int y)
+@{
+function put_vp8_pixels16_neon, export=1
+ ldr r12, [sp, #0] @ load 'h' from stack
+1:
+ sub r12, r12, #4
+ cmp r12, #0
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [r2], r3
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q3}, [r2], r3
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q1}, [r0], r1
+ vst1.8 {q2}, [r0], r1
+ vst1.8 {q3}, [r0], r1
+ bgt 1b
+ bx lr
+endfunc
+@}
+
+@void put_vp8_pixels8_neon(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int x, int y)
+@{
+function put_vp8_pixels8_neon, export=1
+ ldr r12, [sp, #0] @ load 'h' from stack
+1:
+ sub r12, r12, #4
+ cmp r12, #0
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d1}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d3}, [r0], r1
+ bgt 1b
+ bx lr
+endfunc
+@}
+
+@void put_vp8_pixels4_neon(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int x, int y)
+@{
+function put_vp8_pixels4_neon, export=1
+ ldr r12, [sp, #0] @ load 'h' from stack
+ push {r4-r7}
+1:
+ sub r12, r12, #4
+ cmp r12, #0
+ ldr r4, [r2], r3
+ ldr r5, [r2], r3
+ ldr r6, [r2], r3
+ ldr r7, [r2], r3
+ str r4, [r0], r1
+ str r5, [r0], r1
+ str r6, [r0], r1
+ str r7, [r0], r1
+ bgt 1b
+ pop {r4-r7}
+ bx lr
+endfunc
+@}
+
+@ Register layout:
+@ \a and \b -> src[0..5] (preserved, can be d2-d4)
+@ \ret -> dst (return, can be d2-d4)
+@ d0-d1 -> filter (preserved)
+@ Uses:
+@ q8-q15 (although could be reduced)
+.macro put_vp8_epel8_h6, a, b, ret
+ @ note: in put_vp8_epel16_* case, we could perhaps avoid some vext's
+ @ by doing these as quad-word instructions outside of this macro..
+ vext.8 d27, \a, \b, #1 @ src[x + 1]
+ vext.8 d28, \a, \b, #2 @ src[x + 2]
+ vext.8 d29, \a, \b, #3 @ src[x + 3]
+ vext.8 d30, \a, \b, #4 @ src[x + 4]
+ vext.8 d31, \a, \b, #5 @ src[x + 5]
+ vmovl.u8 q10, d28 @ (u16)src[x + 2]
+ vmovl.u8 q9, d27 @ (u16)src[x + 1]
+ vmovl.u8 q8, \a @ (u16)src[x + 0]
+ vmovl.u8 q11, d29 @ (u16)src[x + 3]
+ vmovl.u8 q12, d30 @ (u16)src[x + 4]
+ vmovl.u8 q13, d31 @ (u16)src[x + 5]
+ vmul.u16 q10, q10, d0[2] @ a: filter[2] * src[x + 2] -
+ vmul.u16 q9, q9, d0[1] @ b: filter[1] * src[x + 1] +
+ vmul.u16 q8, q8, d0[0] @ c: filter[0] * src[x + 0] +
+ vmul.u16 q11, q11, d0[3] @ d: filter[3] * src[x + 3] -
+ vmul.u16 q12, q12, d1[0] @ e: filter[4] * src[x + 4] +
+ vmul.u16 q13, q13, d1[1] @ f: filter[5] * src[x + 5] +
+ vsub.s16 q10, q10, q9 @ a - b
+ vsub.s16 q11, q11, q12 @ d - e
+ vadd.s16 q10, q10, q8 @ (a - b) + c
+ vadd.s16 q11, q11, q13 @ (d - e) + f
+ vqadd.s16 q11, q10, q11 @ (a - b + c) + (d - e + f)
+ vqrshrun.s16 \ret, q11, #7
+.endm
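
In scalar terms the macro computes, per output byte, roughly the following (a
sketch of the arithmetic spelled out in the comments above; the final clamp
mirrors the saturating vqrshrun):

    #include <stdint.h>

    /* src points at the first of the six taps (pixel x - 2, since the
     * callers rewind src by two); f is one row of subpel_filters */
    static uint8_t filter_6tap(const uint8_t *src, const int16_t *f)
    {
        int sum = f[2] * src[2] - f[1] * src[1] + f[0] * src[0]
                + f[3] * src[3] - f[4] * src[4] + f[5] * src[5];
        sum = (sum + 64) >> 7;       /* vqrshrun.s16 ..., #7: round + shift */
        if (sum < 0)   sum = 0;      /* ... and saturate to 0..255 */
        if (sum > 255) sum = 255;
        return (uint8_t)sum;
    }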
+
+@ Register layout:
+@ d4,d27-d31-> src[0..5]
+@ \ret -> dst (return, can be d2-d4)
+@ d0-d1 -> filter (preserved)
+@ Uses:
+@ q8-q15 (although could be reduced)
+.macro put_vp8_epel8_v6, ret
+ @ XXX below is similar to put_vp8_epel8_h6.. combine into single macro!
+ vmovl.u8 q10, d28 @ (u16)src[x + 2]
+ vmovl.u8 q9, d27 @ (u16)src[x + 1]
+ vmovl.u8 q8, d4 @ (u16)src[x + 0]
+ vmovl.u8 q11, d29 @ (u16)src[x + 3]
+ vmovl.u8 q12, d30 @ (u16)src[x + 4]
+ vmovl.u8 q13, d31 @ (u16)src[x + 5]
+ vmul.u16 q10, q10, d0[2] @ a: filter[2] * src[x + 2] -
+ vmul.u16 q9, q9, d0[1] @ b: filter[1] * src[x + 1] +
+ vmul.u16 q8, q8, d0[0] @ c: filter[0] * src[x + 0] +
+ vmul.u16 q11, q11, d0[3] @ d: filter[3] * src[x + 3] -
+ vmul.u16 q12, q12, d1[0] @ e: filter[4] * src[x + 4] +
+ vmul.u16 q13, q13, d1[1] @ f: filter[5] * src[x + 5] +
+ vsub.s16 q10, q10, q9 @ a - b
+ vsub.s16 q11, q11, q12 @ d - e
+ vadd.s16 q10, q10, q8 @ (a - b) + c
+ vadd.s16 q11, q11, q13 @ (d - e) + f
+ vqadd.s16 q11, q10, q11 @ (a - b + c) + (d - e + f)
+ vqrshrun.s16 \ret, q11, #7
+.endm
+
+@ Register layout:
+@ \a and \b -> src[0..5] (preserved, can be d2-d4)
+@ \ret -> dst (return, can be d2-d4)
+@ d0-d1 -> filter (preserved)
+@ Uses:
+@ q8-q15 (although could be reduced)
+.macro put_vp8_epel8_h4, a, b, ret
+ @ note: in put_vp8_epel16_* case, we could perhaps avoid some vext's
+ @ by doing these as quad-word instructions outside of this macro..
+ vext.8 d27, \a, \b, #1 @ src[x + 1]
+ vext.8 d28, \a, \b, #2 @ src[x + 2]
+ vext.8 d29, \a, \b, #3 @ src[x + 3]
+ vext.8 d30, \a, \b, #4 @ src[x + 4]
+ vmovl.u8 q10, d28 @ (u16)src[x + 2]
+ vmovl.u8 q9, d27 @ (u16)src[x + 1]
+ vmovl.u8 q11, d29 @ (u16)src[x + 3]
+ vmovl.u8 q12, d30 @ (u16)src[x + 4]
+ vmul.u16 q10, q10, d0[2] @ a: filter[2] * src[x + 2] -
+ vmul.u16 q9, q9, d0[1] @ b: filter[1] * src[x + 1] +
+ vmul.u16 q11, q11, d0[3] @ d: filter[3] * src[x + 3] -
+ vmul.u16 q12, q12, d1[0] @ e: filter[4] * src[x + 4] +
+ vsub.s16 q10, q10, q9 @ a - b
+ vsub.s16 q11, q11, q12 @ d - e
+ vqadd.s16 q11, q10, q11 @ (a - b) + (d - e)
+ vqrshrun.s16 \ret, q11, #7
+.endm
+
+@ Register layout:
+@ d27-d30 -> src[1..4]
+@ \ret -> dst (return, can be d2-d4)
+@ d0-d1 -> filter (preserved)
+@ Uses:
+@ q8-q15 (although could be reduced)
+.macro put_vp8_epel8_v4, ret
+ @ XXX below is similar to put_vp8_epel8_h4.. combine into single macro!
+ vmovl.u8 q10, d28 @ (u16)src[x + 2]
+ vmovl.u8 q9, d27 @ (u16)src[x + 1]
+ vmovl.u8 q11, d29 @ (u16)src[x + 3]
+ vmovl.u8 q12, d30 @ (u16)src[x + 4]
+ vmul.u16 q10, q10, d0[2] @ a: filter[2] * src[x + 2] -
+ vmul.u16 q9, q9, d0[1] @ b: filter[1] * src[x + 1] +
+ vmul.u16 q11, q11, d0[3] @ d: filter[3] * src[x + 3] -
+ vmul.u16 q12, q12, d1[0] @ e: filter[4] * src[x + 4] +
+ vsub.s16 q10, q10, q9 @ a - b
+ vsub.s16 q11, q11, q12 @ d - e
+ vqadd.s16 q11, q10, q11 @ (a - b) + (d - e)
+ vqrshrun.s16 \ret, q11, #7
+.endm
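
The two 4-tap macros above are the same arithmetic with the outer taps dropped
(same assumptions as the 6-tap sketch):

    /* only filter[1..4] and the four middle source samples are used */
    static uint8_t filter_4tap(const uint8_t *src, const int16_t *f)
    {
        int sum = f[2] * src[2] - f[1] * src[1]
                + f[3] * src[3] - f[4] * src[4];
        sum = (sum + 64) >> 7;
        if (sum < 0)   sum = 0;
        if (sum > 255) sum = 255;
        return (uint8_t)sum;
    }

This is also why put_vp8_epel8_v4_neon below only rewinds src by one row: the
active taps are rows -1 .. +2 around the output position.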
+
+@void put_vp8_epel16_v6_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel16_v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ subtract two rows
+ push {r4-r5}
+
+ @ note that this is somewhat similar to the second part of
+ @ put_vp8_epel16_h6v6_neon, so there is potential to factor it out
+ @ into a common macro.
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+1:
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d27}, [r2], r3
+ vld1.8 {d28}, [r2], r3
+ vld1.8 {d29}, [r2], r3
+ vld1.8 {d30}, [r2], r3
+ vld1.8 {d31}, [r2]
+
+ sub r2, r2, r3, lsl #2
+ sub r2, r2, r3
+ add r2, r2, #8
+
+ put_vp8_epel8_v6 ret=d2
+
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d27}, [r2], r3
+ vld1.8 {d28}, [r2], r3
+ vld1.8 {d29}, [r2], r3
+ vld1.8 {d30}, [r2], r3
+ vld1.8 {d31}, [r2]
+
+ sub r2, r2, r3, lsl #2
+ sub r2, r2, #8
+
+ put_vp8_epel8_v6 ret=d3
+
+ vst1.8 {d2-d3}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
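
A note on the filter lookup used here and in the functions below: the
"add r4, r5, r4, lsl #4" step indexes the subpel_filters table at the end of
this file, since each row is 8 int16_t values (16 bytes) and the mx/my value a
given function actually uses is in the range 1..7 (the full-pel case goes
through the put_vp8_pixels* copy functions instead).  An illustrative C
equivalent of that address computation:

    /* m is mx or my, 1..7 for the filtered cases */
    static const int16_t *get_subpel_filter(const int16_t filters[7][8], int m)
    {
        return filters[m - 1];  /* == (const int16_t *)filters + (m - 1) * 8 */
    }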
+
+@void put_vp8_epel16_h6_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel16_h6_neon, export=1
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+1:
+ vld1.8 {d2,d3,d4}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h6 a=d2, b=d3, ret=d2
+ put_vp8_epel8_h6 a=d3, b=d4, ret=d3
+
+ vst1.8 {d2-d3}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel16_h6v6_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel16_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ subtract two rows
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ @ tmp_array is the 336 bytes below stack-ptr.. note: C code uses 592
+ @ bytes, but this seems unnecessary. (But would it be better to get
+ @ 16 byte alignment of tmp_array, instead of 8?)
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+ sub r5, sp, #336 @ tmp = tmp_array
+ add r12, r12, #5 @ h += 5
+1:
+ vld1.8 {d2,d3,d4}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h6 a=d2, b=d3, ret=d2
+ put_vp8_epel8_h6 a=d3, b=d4, ret=d3
+
+ vst1.8 {d2-d3}, [r5]!
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+ sub r5, sp, #336 @ tmp = tmp_array
+ mov r4, #16
+2:
+ vld1.8 {d4}, [r5], r4
+ vld1.8 {d27}, [r5], r4
+ vld1.8 {d28}, [r5], r4
+ vld1.8 {d29}, [r5], r4
+ vld1.8 {d30}, [r5], r4
+ vld1.8 {d31}, [r5]
+ sub r5, r5, #72 @ tmp -= (16 * 5) - 8
+
+ put_vp8_epel8_v6 ret=d2
+
+ vld1.8 {d4}, [r5], r4
+ vld1.8 {d27}, [r5], r4
+ vld1.8 {d28}, [r5], r4
+ vld1.8 {d29}, [r5], r4
+ vld1.8 {d30}, [r5], r4
+ vld1.8 {d31}, [r5]
+ sub r5, r5, #72 @ tmp -= (16 * 5) - 8
+
+ put_vp8_epel8_v6 ret=d3
+
+ vst1.8 {d2-d3}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 2b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
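
The two-pass structure above in rough scalar form, reusing the filter_6tap()
sketch from earlier; this is mainly meant to show where the 336 comes from:
16 bytes per row * (16 + 5) rows = 336, and likewise 8 * (16 + 5) = 168 for
the 8-wide variants further down:

    static void epel16_h6v6_sketch(uint8_t *dst, int dststride,
                                   uint8_t *src, int srcstride, int h,
                                   const int16_t *fx, const int16_t *fy)
    {
        uint8_t tmp[16 * 21];
        int x, y, i;

        src -= 2 * srcstride + 2;               /* rewind two rows, two cols */
        for (y = 0; y < h + 5; y++, src += srcstride)      /* first pass */
            for (x = 0; x < 16; x++)
                tmp[y * 16 + x] = filter_6tap(src + x, fx);

        for (y = 0; y < h; y++, dst += dststride)          /* second pass */
            for (x = 0; x < 16; x++) {
                uint8_t col[6];                 /* one column of 6 tmp rows */
                for (i = 0; i < 6; i++)
                    col[i] = tmp[(y + i) * 16 + x];
                dst[x] = filter_6tap(col, fy);
            }
    }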
+
+@void put_vp8_epel8_v6_neon (uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ subtract two rows
+ push {r4-r5}
+
+ @ note that this is somewhat similar to the second part of
+ @ put_vp8_epel16_h6v6_neon, so there is potential to factor it out
+ @ into a common macro.
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+1:
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d27}, [r2], r3
+ vld1.8 {d28}, [r2], r3
+ vld1.8 {d29}, [r2], r3
+ vld1.8 {d30}, [r2], r3
+ vld1.8 {d31}, [r2]
+
+ sub r2, r2, r3, lsl #2
+
+ put_vp8_epel8_v6 ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel8_h6_neon (uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_h6_neon, export=1
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h6 a=d2, b=d3, ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel8_h6v6_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ subtract two rows
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ @ tmp_array is the 168 bytes below stack-ptr.. note: check if the 8-col
+ @ functions would ever get called with height of 16? Maybe tmp_array
+ @ could be smaller. (But would it be better to get 16 byte alignment of
+ @ tmp_array, instead of 8?)
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ add r12, r12, #5 @ h += 5
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h6 a=d2, b=d3, ret=d2
+
+ vst1.8 {d2}, [r5]!
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ mov r4, #8
+2:
+ vld1.8 {d4}, [r5], r4
+ vld1.8 {d27}, [r5], r4
+ vld1.8 {d28}, [r5], r4
+ vld1.8 {d29}, [r5], r4
+ vld1.8 {d30}, [r5], r4
+ vld1.8 {d31}, [r5]
+ sub r5, r5, #32 @ tmp -= (8 * 5) - 8
+
+ put_vp8_epel8_v6 ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 2b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel8_v4_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_v4_neon, export=1
+ sub r2, r2, r3 @ subtract one row
+ push {r4-r5}
+
+ @ note that this is somewhat similar to the second part of
+ @ put_vp8_epel16_h6v6_neon, so there is potential to factor it out
+ @ into a common macro.
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+1:
+ vld1.8 {d27}, [r2], r3
+ vld1.8 {d28}, [r2], r3
+ vld1.8 {d29}, [r2], r3
+ vld1.8 {d30}, [r2]
+ sub r2, r2, r3, lsl #1
+
+ put_vp8_epel8_v4 ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel8_h4_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_h4_neon, export=1
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h4 a=d2, b=d3, ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel8_h4v4_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_h4v4_neon, export=1
+ sub r2, r2, r3, lsl #1 @ subtract two rows
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ @ tmp_array is the 168 bytes below stack-ptr.. note: check if the 8-col
+ @ functions would ever get called with height of 16? Maybe tmp_array
+ @ could be smaller. (But would it be better to get 16 byte alignment of
+ @ tmp_array, instead of 8?)
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ add r12, r12, #5 @ h += 5
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h4 a=d2, b=d3, ret=d2
+
+ vst1.8 {d2}, [r5]!
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ mov r4, #8
+ add r5, #8
+2:
+ vld1.8 {d27}, [r5], r4
+ vld1.8 {d28}, [r5], r4
+ vld1.8 {d29}, [r5], r4
+ vld1.8 {d30}, [r5]
+ sub r5, r5, #16 @ tmp -= (8 * 3) - 8
+
+ put_vp8_epel8_v4 ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 2b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel8_h6v4_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_h6v4_neon, export=1
+ sub r2, r2, r3, lsl #1 @ subtract two rows
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ @ tmp_array is the 168 bytes below stack-ptr.. note: check if the 8-col
+ @ functions would ever get called with height of 16? Maybe tmp_array
+ @ could be smaller. (But would it be better to get 16 byte alignment of
+ @ tmp_array, instead of 8?)
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ add r12, r12, #5 @ h += 5
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h6 a=d2, b=d3, ret=d2
+
+ vst1.8 {d2}, [r5]!
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ mov r4, #8
+ add r5, #8
+2:
+ vld1.8 {d27}, [r5], r4
+ vld1.8 {d28}, [r5], r4
+ vld1.8 {d29}, [r5], r4
+ vld1.8 {d30}, [r5]
+ sub r5, r5, #16 @ tmp -= (8 * 3) - 8
+
+ put_vp8_epel8_v4 ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 2b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@void put_vp8_epel8_h4v6_neon(uint8_t * dst, int dststride, uint8_t * src, int srcstride, int h, int mx, int my)
+@{
+function put_vp8_epel8_h4v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ subtract two rows
+ sub r2, r2, #2 @ subtract two cols
+ push {r4-r5}
+
+ @ tmp_array is the 168 bytes below stack-ptr.. note: check if the 8-col
+ @ functions would ever get called with height of 16? Maybe tmp_array
+ @ could be smaller. (But would it be better to get 16 byte alignment of
+ @ tmp_array, instead of 8?)
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ load 'mx' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[mx - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ add r12, r12, #5 @ h += 5
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ load src (last 3 bytes unused)
+
+ put_vp8_epel8_h4 a=d2, b=d3, ret=d2
+
+ vst1.8 {d2}, [r5]!
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #16] @ load 'my' from stack
+ ldr r5, subpel_filters
+ sub r4, r4, #1
+ ldr r12, [sp, #8] @ load 'h' from stack
+ add r4, r5, r4, lsl #4
+ vld1.16 {d0-d1}, [r4, :64] @ filter = subpel_filters[my - 1]
+ sub r5, sp, #168 @ tmp = tmp_array
+ mov r4, #8
+2:
+ vld1.8 {d4}, [r5], r4
+ vld1.8 {d27}, [r5], r4
+ vld1.8 {d28}, [r5], r4
+ vld1.8 {d29}, [r5], r4
+ vld1.8 {d30}, [r5], r4
+ vld1.8 {d31}, [r5]
+ sub r5, r5, #32 @ tmp -= (8 * 5) - 8
+
+ put_vp8_epel8_v6 ret=d2
+
+ vst1.8 {d2}, [r0], r1
+ sub r12, r12, #1 @ h--
+ cmp r12, #0
+ bne 2b
+
+ pop {r4-r5}
+ bx lr
+endfunc
+@}
+
+@ note: worst case sum of all 6-tap filter values (with signs applied) * 255
+@ is 0x7f80, so 16-bit arithmetic can be used to apply the filters
+subpel_filters:
+ .long _subpel_filters
+ .align 8
+_subpel_filters:
+ .short 0, 6, 123, 12, 1, 0, 0, 0
+ .short 2, 11, 108, 36, 8, 1, 0, 0
+ .short 0, 9, 93, 50, 6, 0, 0, 0
+ .short 3, 16, 77, 77, 16, 3, 0, 0
+ .short 0, 6, 50, 93, 9, 0, 0, 0
+ .short 1, 8, 36, 108, 11, 2, 0, 0
+ .short 0, 1, 12, 123, 6, 0, 0, 0
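
(As a quick check of the figure in the note above the table: with the signs
the filter macros apply, every row sums to 128, e.g.
3 - 16 + 77 + 77 - 16 + 3 = 128 and 2 - 11 + 108 + 36 - 8 + 1 = 128, so a
constant input of 255 produces at most 128 * 255 = 32640 = 0x7f80.)
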
--
1.7.1.1