[FFmpeg-devel] [PATCH] NEON: VC1 no_rnd chroma MC

Thu Apr 16 22:29:12 CEST 2009

David Conrad <lessen42 at gmail.com> writes:

> Hi,
>
> This extends the h264_chroma_mc8 macro to also make no_rnd variants
> for VC1.
> 10-15% overall decode speedup depending on source.
>
> diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
> index 44a1373..0f1c467 100644
> --- a/libavcodec/arm/h264dsp_neon.S
> +++ b/libavcodec/arm/h264dsp_neon.S
> @@ -56,13 +56,16 @@
>          .endm
>
>  /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
> -        .macro  h264_chroma_mc8 type
> -function ff_\type\()_h264_chroma_mc8_neon, export=1
> +        .macro  h264_chroma_mc8 type name=h264 vshrn=vrshrn.u16 no_rnd=0
> +function ff_\type\()_\name\()_chroma_mc8_neon, export=1
>          push            {r4-r7, lr}
>          ldrd            r4,  [sp, #20]
>  .ifc \type,avg
>          mov             lr,  r0
>  .endif
> +.if \no_rnd
> +        vmov.u16        q15, #28
> +.endif
>          pld             [r1]
>          pld             [r1, r2]
>
> @@ -100,10 +103,14 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
>          vmlal.u8        q9,  d7,  d1
>          vmlal.u8        q9,  d4,  d2
>          vmlal.u8        q9,  d5,  d3
> -        vrshrn.u16      d16, q8,  #6
> +.if \no_rnd
> +        vadd.u16        q8,  q8,  q15
> +        vadd.u16        q9,  q9,  q15
> +.endif

This will stall waiting for q9: the vadd.u16 on q9 reads the result of the
vmlal.u8 into q9 that was issued just before it.

> +        \vshrn          d16, q8,  #6
>          vld1.64         {d6, d7}, [r5], r4
>          pld             [r1]
> -        vrshrn.u16      d17, q9,  #6
> +        \vshrn          d17, q9,  #6
>  .ifc \type,avg
>          vld1.64         {d20}, [lr,:64], r2
>          vld1.64         {d21}, [lr,:64], r2
> @@ -135,8 +142,12 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
>          vmull.u8        q9,  d6,  d0
>          vmlal.u8        q9,  d4,  d1
>          vld1.64         {d6}, [r5], r4
> -        vrshrn.u16      d16, q8,  #6
> -        vrshrn.u16      d17, q9,  #6
> +.if \no_rnd
> +        vadd.u16        q8,  q8,  q15
> +        vadd.u16        q9,  q9,  q15
> +.endif
> +        \vshrn          d16, q8,  #6
> +        \vshrn          d17, q9,  #6

Ditto.

>  .ifc \type,avg
>          vld1.64         {d20}, [lr,:64], r2
>          vld1.64         {d21}, [lr,:64], r2
> @@ -162,10 +173,14 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
>          vld1.64         {d4, d5}, [r1], r2
>          vmull.u8        q9,  d6,  d0
>          vmlal.u8        q9,  d7,  d1
> +.if \no_rnd
> +        vadd.u16        q8,  q8,  q15
> +        vadd.u16        q9,  q9,  q15
> +.endif

Ditto.

Is there no way to move those adds down a bit?

-- 
Måns Rullgård
mans at mansr.com