[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: h264, add chroma loop filters for 10bit
Martin Storsjö
martin at martin.st
Thu Aug 5 01:15:57 EEST 2021
On Fri, 16 Jul 2021, Mikhail Nitenko wrote:
> Benchmarks:                                                A53     A72
> h264_h_loop_filter_chroma422_10bpp_c:                    293.0   116.7
> h264_h_loop_filter_chroma422_10bpp_neon:                 283.7   126.2
> h264_h_loop_filter_chroma_10bpp_c:                       165.2    58.5
> h264_h_loop_filter_chroma_10bpp_neon:                     74.7    87.2
> h264_h_loop_filter_chroma_intra422_10bpp_c:              246.2   124.5
> h264_h_loop_filter_chroma_intra422_10bpp_neon:           178.7    70.0
> h264_h_loop_filter_chroma_intra_10bpp_c:                 121.0    40.5
> h264_h_loop_filter_chroma_intra_10bpp_neon:               73.7    59.2
> h264_h_loop_filter_chroma_mbaff422_10bpp_c:              145.7    72.7
> h264_h_loop_filter_chroma_mbaff422_10bpp_neon:           151.7    87.2
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c:        117.5    48.0
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon:      73.7    37.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_c:            57.0    27.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon:         81.7    50.7
> h264_h_loop_filter_luma_intra_8bpp_c:                    242.7   134.0
> h264_h_loop_filter_luma_intra_8bpp_neon:                 100.7    53.5
> h264_v_loop_filter_chroma_10bpp_c:                       257.2   138.5
> h264_v_loop_filter_chroma_10bpp_neon:                     98.2    67.5
> h264_v_loop_filter_chroma_intra_10bpp_c:                 158.0    76.2
> h264_v_loop_filter_chroma_intra_10bpp_neon:               62.7    36.5
>
> Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
> ---
>
> this code is a bit slow, particularly the horizontal versions, so any
> suggestions are greatly appreciated!
>
> libavcodec/aarch64/h264dsp_init_aarch64.c | 29 +++
> libavcodec/aarch64/h264dsp_neon.S | 299 ++++++++++++++++++++++
> 2 files changed, 328 insertions(+)
>
> diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
> index d5baccf235..9ee9c11e15 100644
> --- a/libavcodec/aarch64/h264dsp_init_aarch64.c
> +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
> @@ -83,6 +83,21 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
> int16_t *block, int stride,
> const uint8_t nnzc[6*8]);
>
> +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +
> av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
> const int chroma_format_idc)
> {
> @@ -125,5 +140,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
> c->h264_idct8_add = ff_h264_idct8_add_neon;
> c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
> c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
> + } else if (have_neon(cpu_flags) && bit_depth == 10) {
> + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10;
> + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10;
> +
> + if (chroma_format_idc <= 1) {
> + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10;
> + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10;
> + } else {
> + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10;
> + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
> + }
> }
> }
> diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
> index fbb8ecc463..92e5afa524 100644
> --- a/libavcodec/aarch64/h264dsp_neon.S
> +++ b/libavcodec/aarch64/h264dsp_neon.S
> @@ -827,3 +827,302 @@ endfunc
> weight_func 16
> weight_func 8
> weight_func 4
> +
> +.macro h264_loop_filter_start_10
> + cmp w2, #0
> + ldr w6, [x4]
> + ccmp w3, #0, #0, ne
> + lsl w2, w2, #2 // shift needed for 10bit
> + mov v24.S[0], w6
> + lsl w3, w3, #2
Please indent the operand columns in the lsl instructions like the rest of
the file.
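I.e. something like this, with the operands starting in the same column as
elsewhere:

        lsl             w2,  w2,  #2            // shift needed for 10bit
        lsl             w3,  w3,  #2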
> + and w8, w6, w6, lsl #16
> + b.eq 1f
> + cmp w6, #0
> + b.eq 1f
This extra cmp and b.eq pair isn't present in the 8 bit version. Why is it
needed here, i.e. is this a typo or is it legitimate?
> + ands w8, w8, w8, lsl #8
> + b.ge 2f
> +1:
> + ret
> +2:
> +.endm
> +
> +.macro h264_loop_filter_start_intra_10
> + orr w4, w2, w3
> + cbnz w4, 1f
> + ret
> +1:
> + sxtw x1, w1
This sxtw (and all the other ones) is unneeded, as the stride argument is a
ptrdiff_t these days (it was an int originally); I sent a separate patch to
fix this for the existing 8 bit functions.
> + lsl w2, w2, #2 // shift needed for 10bit
> + lsl w3, w3, #2 // shift needed for 10bit
> + dup v30.8h, w2 // alpha
> + dup v31.8h, w3 // beta
> +.endm
> +
> +.macro h264_loop_filter_chroma_10
> + dup v22.8h, w2 // alpha
> + dup v23.8h, w3 // beta
> + uxtl v24.8h, v24.8b // tc0
> +
> + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0)
> + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0)
> +
> + cmhi v26.8h, v22.8h, v26.8h // < alpha
> + cmhi v28.8h, v23.8h, v28.8h // < beta
> + cmhi v30.8h, v23.8h, v30.8h // < beta
> +
> + uxtl v4.4s, v0.4h
> + uxtl2 v5.4s, v0.8h
> +
> + and v26.16b, v26.16b, v28.16b
> +
> + usubw v4.4s, v4.4s, v16.4h
> + usubw2 v5.4s, v5.4s, v16.8h
I'm not sure this step needs to be done with 32 bit elements; at least this
far (the difference of two pixels), the result only needs a signed 11 bit
range, so it shouldn't require widening?
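Untested, but I believe the whole delta calculation fits in 16 bit elements,
roughly:

        sub             v4.8h,  v0.8h,  v16.8h  // q0 - p0, a signed 11 bit value
        shl             v4.8h,  v4.8h,  #2      // 4 * (q0 - p0)
        add             v4.8h,  v4.8h,  v18.8h  // + p1
        sub             v4.8h,  v4.8h,  v2.8h   // - q1, stays within +/-5115
        srshr           v4.8h,  v4.8h,  #3      // (4 * (q0 - p0) + p1 - q1 + 4) >> 3

That would get rid of the whole uxtl/usubw/uaddw/rshrn chain.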
> +
> + and v26.16b, v26.16b, v30.16b
> +
> + shl v4.4s, v4.4s, #2
> + shl v5.4s, v5.4s, #2
> +
> + mov x8, v26.d[0]
> + mov x9, v26.d[1]
> + orr x8, x8, x9
> +
> + sli v24.8H, v24.8H, #8
> + uxtl v24.8H, v24.8B
> + uaddw v4.4s, v4.4s, v18.4h
> + uaddw2 v5.4s, v5.4s, v18.8h // add p1
> +
> + cbz x8, 9f
> +
> + usubw v4.4s, v4.4s, v2.4h
> + usubw2 v5.4s, v5.4s, v2.8h // sub q1
> + rshrn v4.4h, v4.4s, #3
> + rshrn2 v4.8h, v5.4s, #3
> +
> + mov w8, #1
> + dup v31.8h, w8 // this is actually important for higher depths, but not needed in 8 bit
You can use 'movi' instead of mov+dup. And instead of having a comment
saying it's important, I'd rather see the comment explain what it does or
why; e.g. it could refer to 'tc'.
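I.e. simply:

        movi            v31.8h, #1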
> + sub v24.8h, v24.8h, v31.8h
> + shl v24.8h, v24.8h, #2
> + add v24.8h, v24.8h, v31.8h
So this does v24 = ((v24 - 1) << 2) + 1. Isn't that equivalent to v24 =
(v24 << 2) - 3?
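(Expanding: ((x - 1) << 2) + 1 = 4*x - 4 + 1 = 4*x - 3 = (x << 2) - 3.)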
> + mov w8, #0
> + dup v31.8h, w8
> + smax v24.8h, v24.8h, v31.8h // this all feels like a huge hack (needed to exclude neg values)
You might want to use 'usqadd' instead of the add/sub + smax here. You could
also start calculating v24 a bit earlier, interleaved with the code that
finishes v4/v5.
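Untested, but roughly:

        mvni            v31.8h, #2              // -3
        shl             v24.8h, v24.8h, #2      // tc0 << 2
        usqadd          v24.8h, v31.8h          // (tc0 << 2) - 3, saturated at 0

usqadd treats the destination as unsigned while the addend is signed, so the
sum saturates at zero and the separate smax against 0 goes away.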
> +
> + smin v4.8h, v4.8h, v24.8h
> + neg v25.8h, v24.8h
> + smax v4.8h, v4.8h, v25.8h
> +
> + uxtl v22.4s, v0.4h
> + uxtl2 v23.4s, v0.8h
None of this should require 32 bit elements either.
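Untested, but e.g. like this (reusing v22/v23, which look free at this
point):

        movi            v22.8h, #0
        mvni            v23.8h, #0xFC,  lsl #8  // 0x3FF = 1023
        add             v16.8h, v16.8h, v4.8h   // p0 + delta
        sub             v0.8h,  v0.8h,  v4.8h   // q0 - delta
        smax            v16.8h, v16.8h, v22.8h  // clamp to [0, 1023]
        smax            v0.8h,  v0.8h,  v22.8h
        smin            v16.8h, v16.8h, v23.8h
        smin            v0.8h,  v0.8h,  v23.8h

This would also fold in the final clamping against 1023 further down.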
> +
> + and v4.16B, v4.16B, v26.16B
> +
> + uxtl v28.4s, v16.4h
> + uxtl2 v29.4s, v16.8h
> +
> + saddw v28.4s, v28.4s, v4.4h
> + saddw2 v29.4s, v29.4s, v4.8h
> +
> + ssubw v22.4s, v22.4s, v4.4h
> + ssubw2 v23.4s, v23.4s, v4.8h
> +
> + sqxtun v16.4h, v28.4s
> + sqxtun2 v16.8h, v29.4s
> +
> + sqxtun v0.4h, v22.4s
> + sqxtun2 v0.8h, v23.4s
> +
> + mov w2, #1023 // for clipping
> + dup v4.8h, w2
This constant can be created with a single mvni instead of the mov+dup. And
if possible, set up your constants a bit earlier to avoid stalls.
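I.e. for the 1023 constant:

        mvni            v4.8h,  #0xFC,  lsl #8  // ~0xFC00 = 0x03FF = 1023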
> + smin v0.8h, v0.8h, v4.8h
> + smin v16.8h, v16.8h, v4.8h
> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> + sxtw x1, w1
> +
> + sub x0, x0, x1, lsl #1
> + ld1 {v18.8h}, [x0], x1
> + ld1 {v16.8h}, [x0], x1
> + ld1 {v0.8h}, [x0], x1
> + ld1 {v2.8h}, [x0]
> +
> + h264_loop_filter_chroma_10
> +
> + sub x0, x0, x1, lsl #1
> + st1 {v16.8h}, [x0], x1
> + st1 {v0.8h}, [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> + sxtw x1, w1
> +
> + sub x0, x0, #4
> +h_loop_filter_chroma420_10:
> + ld1 {v18.d}[0], [x0], x1
> + ld1 {v16.d}[0], [x0], x1
> + ld1 {v0.d}[0], [x0], x1
> + ld1 {v2.d}[0], [x0], x1
> + ld1 {v18.d}[1], [x0], x1
> + ld1 {v16.d}[1], [x0], x1
> + ld1 {v0.d}[1], [x0], x1
> + ld1 {v2.d}[1], [x0], x1
If the horizontal versions of these functions are slow, I'd recommend trying
to load via two alternating address registers, like it's done in the vp9
loop filter functions.
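Untested, but along these lines (with the stride doubled, and the store side
adjusted to match; x9 looks free here):

        add             x9,  x0,  x1
        lsl             x1,  x1,  #1
        ld1             {v18.d}[0], [x0], x1
        ld1             {v16.d}[0], [x9], x1
        ld1             {v0.d}[0],  [x0], x1
        ld1             {v2.d}[0],  [x9], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x9], x1

That way consecutive loads don't all serialize on the post-increment of x0.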
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + h264_loop_filter_chroma_10
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + sub x0, x0, x1, lsl #3
> + st1 {v18.d}[0], [x0], x1
> + st1 {v16.d}[0], [x0], x1
> + st1 {v0.d}[0], [x0], x1
> + st1 {v2.d}[0], [x0], x1
> + st1 {v18.d}[1], [x0], x1
> + st1 {v16.d}[1], [x0], x1
> + st1 {v0.d}[1], [x0], x1
> + st1 {v2.d}[1], [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma422_neon_10, export=1
> + sxtw x1, w1
> + h264_loop_filter_start_10
> + add x5, x0, x1
> + sub x0, x0, #4
> + add x1, x1, x1
> + mov x7, x30
> + bl h_loop_filter_chroma420_10
> + mov x30, x7
> + sub x0, x5, #4
> + mov v24.s[0], w6
> + b h_loop_filter_chroma420_10
> +endfunc
> +
> +.macro h264_loop_filter_chroma_intra_10
> + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0)
> + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0)
This uses a completely different indentation than the rest; please match the
existing style. I'll send a patch to reindent these bits in the existing
code.
> + cmhi v26.8h, v30.8h, v26.8h // < alpha
> + cmhi v27.8h, v31.8h, v27.8h // < beta
> + cmhi v28.8h, v31.8h, v28.8h // < beta
> + and v26.16b, v26.16b, v27.16b
> + and v26.16b, v26.16b, v28.16b
> + mov x2, v26.d[0]
> + mov x3, v26.d[1]
> + orr x2, x2, x3
> +
> + ushll v4.4s, v18.4h, #1
> + ushll2 v5.4s, v18.8h, #1
I'd recommend moving the 'orr' from above to here instead, to avoid
stalls. You could also consider using 'adds' (there's no 'orrs', but for
these cases it should work just fine) with b.eq instead of 'orr' + 'cbz';
see how that's done in the vp9 code.
And again, 32 bit elements shouldn't be needed here.
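Untested, but e.g. (also doing the 2*p1 in 16 bit):

        shl             v4.8h,  v18.8h, #1      // 2 * p1 fits in 16 bit
        mov             x2,  v26.d[0]
        mov             x3,  v26.d[1]
        adds            x2,  x2,  x3
        b.eq            9f                      // or wherever the cbz went

Since the lanes are all-ones/all-zeros masks, the sum can only be zero (and
set the Z flag) if both halves are zero.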
// Martin