[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: h264, add chroma loop filters for 10bit
Martin Storsjö
martin at martin.st
Thu Aug 5 01:15:57 EEST 2021
On Fri, 16 Jul 2021, Mikhail Nitenko wrote:
> Benchmarks:                                                A53     A72
> h264_h_loop_filter_chroma422_10bpp_c:                    293.0   116.7
> h264_h_loop_filter_chroma422_10bpp_neon:                 283.7   126.2
> h264_h_loop_filter_chroma_10bpp_c:                       165.2    58.5
> h264_h_loop_filter_chroma_10bpp_neon:                     74.7    87.2
> h264_h_loop_filter_chroma_intra422_10bpp_c:              246.2   124.5
> h264_h_loop_filter_chroma_intra422_10bpp_neon:           178.7    70.0
> h264_h_loop_filter_chroma_intra_10bpp_c:                 121.0    40.5
> h264_h_loop_filter_chroma_intra_10bpp_neon:               73.7    59.2
> h264_h_loop_filter_chroma_mbaff422_10bpp_c:              145.7    72.7
> h264_h_loop_filter_chroma_mbaff422_10bpp_neon:           151.7    87.2
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c:        117.5    48.0
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon:      73.7    37.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_c:            57.0    27.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon:         81.7    50.7
> h264_h_loop_filter_luma_intra_8bpp_c:                    242.7   134.0
> h264_h_loop_filter_luma_intra_8bpp_neon:                 100.7    53.5
> h264_v_loop_filter_chroma_10bpp_c:                       257.2   138.5
> h264_v_loop_filter_chroma_10bpp_neon:                     98.2    67.5
> h264_v_loop_filter_chroma_intra_10bpp_c:                 158.0    76.2
> h264_v_loop_filter_chroma_intra_10bpp_neon:               62.7    36.5
>
> Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
> ---
>
> this code is a bit slow, particularly the horizontal versions, so any
> suggestions are greatly appreciated!
>
> libavcodec/aarch64/h264dsp_init_aarch64.c | 29 +++
> libavcodec/aarch64/h264dsp_neon.S | 299 ++++++++++++++++++++++
> 2 files changed, 328 insertions(+)
>
> diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
> index d5baccf235..9ee9c11e15 100644
> --- a/libavcodec/aarch64/h264dsp_init_aarch64.c
> +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
> @@ -83,6 +83,21 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
> int16_t *block, int stride,
> const uint8_t nnzc[6*8]);
>
> +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +
> av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
> const int chroma_format_idc)
> {
> @@ -125,5 +140,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
> c->h264_idct8_add = ff_h264_idct8_add_neon;
> c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
> c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
> + } else if (have_neon(cpu_flags) && bit_depth == 10) {
> + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10;
> + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10;
> +
> + if (chroma_format_idc <= 1) {
> + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10;
> + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10;
> + } else {
> + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10;
> + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
> + }
> }
> }
> diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
> index fbb8ecc463..92e5afa524 100644
> --- a/libavcodec/aarch64/h264dsp_neon.S
> +++ b/libavcodec/aarch64/h264dsp_neon.S
> @@ -827,3 +827,302 @@ endfunc
> weight_func 16
> weight_func 8
> weight_func 4
> +
> +.macro h264_loop_filter_start_10
> + cmp w2, #0
> + ldr w6, [x4]
> + ccmp w3, #0, #0, ne
> + lsl w2, w2, #2 // shift needed for 10bit
> + mov v24.S[0], w6
> + lsl w3, w3, #2
Please indent the operand columns in the lsl instructions like the rest of
the file.
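I.e. something like this, with the operands starting in the same column as
elsewhere:

        lsl             w2,  w2,  #2            // shift needed for 10bit
        lsl             w3,  w3,  #2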
> + and w8, w6, w6, lsl #16
> + b.eq 1f
> + cmp w6, #0
> + b.eq 1f
This extra cmp and b.eq pair isn't present in the 8 bit version. Why is it
needed here, i.e. is this a typo or is it legitimate?
> + ands w8, w8, w8, lsl #8
> + b.ge 2f
> +1:
> + ret
> +2:
> +.endm
> +
> +.macro h264_loop_filter_start_intra_10
> + orr w4, w2, w3
> + cbnz w4, 1f
> + ret
> +1:
> + sxtw x1, w1
This sxtw (and all the other ones) is unneeded, as the stride argument is a
ptrdiff_t these days (it was an int originally); I sent a separate patch to
fix this for the existing 8 bit functions.
> + lsl w2, w2, #2 // shift needed for 10bit
> + lsl w3, w3, #2 // shift needed for 10bit
> + dup v30.8h, w2 // alpha
> + dup v31.8h, w3 // beta
> +.endm
> +
> +.macro h264_loop_filter_chroma_10
> + dup v22.8h, w2 // alpha
> + dup v23.8h, w3 // beta
> + uxtl v24.8h, v24.8b // tc0
> +
> + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0)
> + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0)
> +
> + cmhi v26.8h, v22.8h, v26.8h // < alpha
> + cmhi v28.8h, v23.8h, v28.8h // < beta
> + cmhi v30.8h, v23.8h, v30.8h // < beta
> +
> + uxtl v4.4s, v0.4h
> + uxtl2 v5.4s, v0.8h
> +
> + and v26.16b, v26.16b, v28.16b
> +
> + usubw v4.4s, v4.4s, v16.4h
> + usubw2 v5.4s, v5.4s, v16.8h
I'm not sure this step needs to be done with 32 bit elements; at least this
far (the difference of two pixels), the result only needs a signed 11 bit
range, so it shouldn't require widening?
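Untested, but I believe the whole delta calculation fits in 16 bit elements,
roughly:

        sub             v4.8h,  v0.8h,  v16.8h  // q0 - p0, a signed 11 bit value
        shl             v4.8h,  v4.8h,  #2      // 4 * (q0 - p0)
        add             v4.8h,  v4.8h,  v18.8h  // + p1
        sub             v4.8h,  v4.8h,  v2.8h   // - q1, stays within +/-5115
        srshr           v4.8h,  v4.8h,  #3      // (4 * (q0 - p0) + p1 - q1 + 4) >> 3

That would get rid of the whole uxtl/usubw/uaddw/rshrn chain.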
> +
> + and v26.16b, v26.16b, v30.16b
> +
> + shl v4.4s, v4.4s, #2
> + shl v5.4s, v5.4s, #2
> +
> + mov x8, v26.d[0]
> + mov x9, v26.d[1]
> + orr x8, x8, x9
> +
> + sli v24.8H, v24.8H, #8
> + uxtl v24.8H, v24.8B
> + uaddw v4.4s, v4.4s, v18.4h
> + uaddw2 v5.4s, v5.4s, v18.8h // add p1
> +
> + cbz x8, 9f
> +
> + usubw v4.4s, v4.4s, v2.4h
> + usubw2 v5.4s, v5.4s, v2.8h // sub q1
> + rshrn v4.4h, v4.4s, #3
> + rshrn2 v4.8h, v5.4s, #3
> +
> + mov w8, #1
> + dup v31.8h, w8 // this is actually important for higher depths, but not needed in 8 bit
You can use 'movi' instead of mov+dup. And instead of having a comment
saying it's important, I'd rather see the comment explain what it does or
why; e.g. it could refer to 'tc'.
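I.e. simply:

        movi            v31.8h, #1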
> + sub v24.8h, v24.8h, v31.8h
> + shl v24.8h, v24.8h, #2
> + add v24.8h, v24.8h, v31.8h
So this does v24 = ((v24 - 1) << 2) + 1. Isn't that equivalent to v24 =
(v24 << 2) - 3?
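(Expanding: ((x - 1) << 2) + 1 = 4*x - 4 + 1 = 4*x - 3 = (x << 2) - 3.)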
> + mov w8, #0
> + dup v31.8h, w8
> + smax v24.8h, v24.8h, v31.8h // this all feels like a huge hack (needed to exclude neg values)
You might want to use 'usqadd' instead of the add/sub + smax here. You could
also start calculating v24 a bit earlier, interleaved with the code that
finishes v4/v5.
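Untested, but roughly:

        mvni            v31.8h, #2              // -3
        shl             v24.8h, v24.8h, #2      // tc0 << 2
        usqadd          v24.8h, v31.8h          // (tc0 << 2) - 3, saturated at 0

usqadd treats the destination as unsigned while the addend is signed, so the
sum saturates at zero and the separate smax against 0 goes away.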
> +
> + smin v4.8h, v4.8h, v24.8h
> + neg v25.8h, v24.8h
> + smax v4.8h, v4.8h, v25.8h
> +
> + uxtl v22.4s, v0.4h
> + uxtl2 v23.4s, v0.8h
None of this should require 32 bit elements either.
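Untested, but e.g. like this (reusing v22/v23, which look free at this
point):

        movi            v22.8h, #0
        mvni            v23.8h, #0xFC,  lsl #8  // 0x3FF = 1023
        add             v16.8h, v16.8h, v4.8h   // p0 + delta
        sub             v0.8h,  v0.8h,  v4.8h   // q0 - delta
        smax            v16.8h, v16.8h, v22.8h  // clamp to [0, 1023]
        smax            v0.8h,  v0.8h,  v22.8h
        smin            v16.8h, v16.8h, v23.8h
        smin            v0.8h,  v0.8h,  v23.8h

This would also fold in the final clamping against 1023 further down.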
> +
> + and v4.16B, v4.16B, v26.16B
> +
> + uxtl v28.4s, v16.4h
> + uxtl2 v29.4s, v16.8h
> +
> + saddw v28.4s, v28.4s, v4.4h
> + saddw2 v29.4s, v29.4s, v4.8h
> +
> + ssubw v22.4s, v22.4s, v4.4h
> + ssubw2 v23.4s, v23.4s, v4.8h
> +
> + sqxtun v16.4h, v28.4s
> + sqxtun2 v16.8h, v29.4s
> +
> + sqxtun v0.4h, v22.4s
> + sqxtun2 v0.8h, v23.4s
> +
> + mov w2, #1023 // for clipping
> + dup v4.8h, w2
This constant can be created with a single mvni instead of the mov+dup. And
if possible, set up your constants a bit earlier to avoid stalls.
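I.e. for the 1023 constant:

        mvni            v4.8h,  #0xFC,  lsl #8  // ~0xFC00 = 0x03FF = 1023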
> + smin v0.8h, v0.8h, v4.8h
> + smin v16.8h, v16.8h, v4.8h
> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> + sxtw x1, w1
> +
> + sub x0, x0, x1, lsl #1
> + ld1 {v18.8h}, [x0], x1
> + ld1 {v16.8h}, [x0], x1
> + ld1 {v0.8h}, [x0], x1
> + ld1 {v2.8h}, [x0]
> +
> + h264_loop_filter_chroma_10
> +
> + sub x0, x0, x1, lsl #1
> + st1 {v16.8h}, [x0], x1
> + st1 {v0.8h}, [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> + sxtw x1, w1
> +
> + sub x0, x0, #4
> +h_loop_filter_chroma420_10:
> + ld1 {v18.d}[0], [x0], x1
> + ld1 {v16.d}[0], [x0], x1
> + ld1 {v0.d}[0], [x0], x1
> + ld1 {v2.d}[0], [x0], x1
> + ld1 {v18.d}[1], [x0], x1
> + ld1 {v16.d}[1], [x0], x1
> + ld1 {v0.d}[1], [x0], x1
> + ld1 {v2.d}[1], [x0], x1
If the horizontal versions of these functions are slow, I'd recommend trying
to load via two alternating address registers, like it's done in the vp9
loop filter functions.
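Untested, but along these lines (with the stride doubled, and the store side
adjusted to match; x9 looks free here):

        add             x9,  x0,  x1
        lsl             x1,  x1,  #1
        ld1             {v18.d}[0], [x0], x1
        ld1             {v16.d}[0], [x9], x1
        ld1             {v0.d}[0],  [x0], x1
        ld1             {v2.d}[0],  [x9], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x9], x1

That way consecutive loads don't all serialize on the post-increment of x0.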
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + h264_loop_filter_chroma_10
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + sub x0, x0, x1, lsl #3
> + st1 {v18.d}[0], [x0], x1
> + st1 {v16.d}[0], [x0], x1
> + st1 {v0.d}[0], [x0], x1
> + st1 {v2.d}[0], [x0], x1
> + st1 {v18.d}[1], [x0], x1
> + st1 {v16.d}[1], [x0], x1
> + st1 {v0.d}[1], [x0], x1
> + st1 {v2.d}[1], [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma422_neon_10, export=1
> + sxtw x1, w1
> + h264_loop_filter_start_10
> + add x5, x0, x1
> + sub x0, x0, #4
> + add x1, x1, x1
> + mov x7, x30
> + bl h_loop_filter_chroma420_10
> + mov x30, x7
> + sub x0, x5, #4
> + mov v24.s[0], w6
> + b h_loop_filter_chroma420_10
> +endfunc
> +
> +.macro h264_loop_filter_chroma_intra_10
> + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0)
> + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0)
This uses a completely different indentation than the rest; please match the
existing style. I'll send a patch to reindent these bits in the existing
code.
> + cmhi v26.8h, v30.8h, v26.8h // < alpha
> + cmhi v27.8h, v31.8h, v27.8h // < beta
> + cmhi v28.8h, v31.8h, v28.8h // < beta
> + and v26.16b, v26.16b, v27.16b
> + and v26.16b, v26.16b, v28.16b
> + mov x2, v26.d[0]
> + mov x3, v26.d[1]
> + orr x2, x2, x3
> +
> + ushll v4.4s, v18.4h, #1
> + ushll2 v5.4s, v18.8h, #1
I'd recommend moving the 'orr' from above to here instead, to avoid
stalls. You could also consider using 'adds' (there's no 'orrs', but for
these cases it should work just fine) with b.eq instead of 'orr' + 'cbz';
see how that's done in the vp9 code.
And again, 32 bit elements shouldn't be needed here.
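Untested, but e.g. (also doing the 2*p1 in 16 bit):

        shl             v4.8h,  v18.8h, #1      // 2 * p1 fits in 16 bit
        mov             x2,  v26.d[0]
        mov             x3,  v26.d[1]
        adds            x2,  x2,  x3
        b.eq            9f                      // or wherever the cbz went

Since the lanes are all-ones/all-zeros masks, the sum can only be zero (and
set the Z flag) if both halves are zero.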
// Martin