[FFmpeg-devel] [PATCH v2 3/4] avcodec/aarch64/hevcdsp: add idct_dc NEON

Thu Feb 11 11:32:08 EET 2021

On Thu, 4 Feb 2021, Josh Dekker wrote:

> Signed-off-by: Josh Dekker <josh at itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 54 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 16 +++++++
> 2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 329038a958..d3902a9e0f 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -5,6 +5,7 @@
>  *
>  * Ported from arm/hevcdsp_idct_neon.S by
>  * Copyright (c) 2020 Reimar Döffinger
> + * Copyright (c) 2020 Josh Dekker
>  *
>  * This file is part of FFmpeg.
>  *
> @@ -568,3 +569,56 @@ tr_16x4 secondpass_10, 20 - 10, 512, 1
> 
> idct_16x16 8
> idct_16x16 10
> +
> +// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
> +.macro idct_dc size bitdepth

This needs a comma between size and bitdepth. Normally, commas are 
optional, but when targeting darwin, commas between macro arguments (both 
in the declaration like this, and when invoking macros) are mandatory (due 
to backwards compatibility with a different macro syntax in legacy gas on 
darwin platforms, I think).

> +function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
> +        ldrsh   w1, [x0]

The indentation of your new code differs from the rest of the existing 
code in the file

> +        mov     w2,  #(1 << (13 - \bitdepth))
> +        add     w1,  w1, #1
> +        asr     w1,  w1, #1
> +        add     w1,  w1, w2
> +        asr     w1,  w1, #(14 - \bitdepth)

The function is a bit slower than expected in some cases; on Cortex A72 
and A73, it's actually slower than what GCC produces for small block 
sizes:

                           Cortex A53     A72      A73
hevc_idct_4x4_dc_8_c:           25.5    11.7     14.7
hevc_idct_4x4_dc_8_neon:        15.5    12.5     15.5

However these 5 instructions above can be replaced with just these three:

         mov     w2,  #((1 << (14 - \bitdepth))+1)
         add     w1,  w1, w2
         asr     w1,  w1, #(15 - \bitdepth)

With that change, I get it down to this:

hevc_idct_4x4_dc_8_neon:        12.7    10.2     13.5

So then it's universally faster than the C code at least.

A different alternative is this:

         movi    v1.8h,  #((1 << (14 - \bitdepth))+1)
         ld1r    {v4.8h}, [x0]
         add     v4.8h,  v4.8h,  v1.8h
         sshr    v0.8h,  v4.8h,  #(15 - \bitdepth)
         sshr    v1.8h,  v4.8h,  #(15 - \bitdepth)
.if \size > 4
         sshr    v2.8h,  v4.8h,  #(15 - \bitdepth)
         sshr    v3.8h,  v4.8h,  #(15 - \bitdepth)

(The reason for doing 4 sshr instructions, instead of just doing 1 
followed by 3 dups, is that this allows all of them to start possibly in 
parallel, as soon as the result of the add is available, instead of having 
a second dup instruction waiting for the result from the sshr.)

That produces the following numbers:

hevc_idct_4x4_dc_8_neon:        12.7    11.5      9.2

I.e. a tiny bit slower than the previous on A72, and faster on A73, but 
also a viable alternative.

One could also consider this prologue:

         ld1r    {v4.8h}, [x0]
         srshr   v4.8h,  v4.8h,  #1
         srshr   v0.8h,  v4.8h,  #(14 - \bitdepth)
         srshr   v1.8h,  v4.8h,  #(14 - \bitdepth)
.if \size > 4
         srshr   v2.8h,  v4.8h,  #(14 - \bitdepth)
         srshr   v3.8h,  v4.8h,  #(14 - \bitdepth)

But that's a worse combination overall:

hevc_idct_4x4_dc_8_neon:        13.5    12.5     10.2

> +        dup  v0.8h,  w1
> +        dup  v1.8h,  w1
> +.if \size > 4
> +        dup  v2.8h,  w1
> +        dup  v3.8h,  w1
> +.if \size > 16 /* dc 32x32 */
> +        mov     x2,  #4
> +1:
> +        subs    x2,  x2, #1
> +.endif
> +        add    x12,  x0,  #64
> +        mov    x13,  #128
> +.if \size > 8 /* dc 16x16 */
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +.endif /* dc 8x8 */
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +.if \size > 16 /* dc 32x32 */
> +        bne 1b
> +.endif
> +.else /* dc 4x4 */
> +        st1   {v0.8h-v1.8h}, [x0]
> +.endif
> +        ret
> +endfunc
> +.endm
> +
> +idct_dc 4 8
> +idct_dc 4 10
> +
> +idct_dc 8 8
> +idct_dc 8 10
> +
> +idct_dc 16 8
> +idct_dc 16 10
> +
> +idct_dc 32 8
> +idct_dc 32 10

Missing commas between the macro arguments.

// Martin