[FFmpeg-devel] [PATCH] NEON: VC1 no_rnd chroma MC
Måns Rullgård
mans
Thu Apr 16 22:29:12 CEST 2009
David Conrad <lessen42 at gmail.com> writes:
> Hi,
>
> This extends the h264_chroma_mc8 macro to also make no_rnd variants
> for VC1.
> 10-15% overall decode speedup depending on source.
>
> diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
> index 44a1373..0f1c467 100644
> --- a/libavcodec/arm/h264dsp_neon.S
> +++ b/libavcodec/arm/h264dsp_neon.S
> @@ -56,13 +56,16 @@
> .endm
>
> /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
> - .macro h264_chroma_mc8 type
> -function ff_\type\()_h264_chroma_mc8_neon, export=1
> + .macro h264_chroma_mc8 type name=h264 vshrn=vrshrn.u16 no_rnd=0
> +function ff_\type\()_\name\()_chroma_mc8_neon, export=1
> push {r4-r7, lr}
> ldrd r4, [sp, #20]
> .ifc \type,avg
> mov lr, r0
> .endif
> +.if \no_rnd
> + vmov.u16 q15, #28
> +.endif
> pld [r1]
> pld [r1, r2]
>
> @@ -100,10 +103,14 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
> vmlal.u8 q9, d7, d1
> vmlal.u8 q9, d4, d2
> vmlal.u8 q9, d5, d3
> - vrshrn.u16 d16, q8, #6
> +.if \no_rnd
> + vadd.u16 q8, q8, q15
> + vadd.u16 q9, q9, q15
> +.endif
This will stall waiting for q9.
> + \vshrn d16, q8, #6
> vld1.64 {d6, d7}, [r5], r4
> pld [r1]
> - vrshrn.u16 d17, q9, #6
> + \vshrn d17, q9, #6
> .ifc \type,avg
> vld1.64 {d20}, [lr,:64], r2
> vld1.64 {d21}, [lr,:64], r2
> @@ -135,8 +142,12 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
> vmull.u8 q9, d6, d0
> vmlal.u8 q9, d4, d1
> vld1.64 {d6}, [r5], r4
> - vrshrn.u16 d16, q8, #6
> - vrshrn.u16 d17, q9, #6
> +.if \no_rnd
> + vadd.u16 q8, q8, q15
> + vadd.u16 q9, q9, q15
> +.endif
> + \vshrn d16, q8, #6
> + \vshrn d17, q9, #6
Ditto: this will also stall waiting for q9.
> .ifc \type,avg
> vld1.64 {d20}, [lr,:64], r2
> vld1.64 {d21}, [lr,:64], r2
> @@ -162,10 +173,14 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
> vld1.64 {d4, d5}, [r1], r2
> vmull.u8 q9, d6, d0
> vmlal.u8 q9, d7, d1
> +.if \no_rnd
> + vadd.u16 q8, q8, q15
> + vadd.u16 q9, q9, q15
> +.endif
Ditto: this will also stall waiting for q9.
Is there no way to move those adds down a bit?
--
Måns Rullgård
mans at mansr.com
More information about the ffmpeg-devel mailing list