[FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: h264, add chroma loop filters for 10bit
Mikhail Nitenko
mnitenko at gmail.com
Mon Aug 16 12:45:45 EEST 2021
Benchmarks: A53 A72
h264_h_loop_filter_chroma422_10bpp_c: 277.5 114.2
h264_h_loop_filter_chroma422_10bpp_neon: 109.7 81.7
h264_h_loop_filter_chroma_10bpp_c: 165.0 75.5
h264_h_loop_filter_chroma_10bpp_neon: 121.2 74.7
h264_h_loop_filter_chroma_intra422_10bpp_c: 324.2 124.2
h264_h_loop_filter_chroma_intra422_10bpp_neon: 155.2 99.5
h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 48.5
h264_h_loop_filter_chroma_intra_10bpp_neon: 79.5 52.7
h264_h_loop_filter_chroma_mbaff422_10bpp_c: 191.0 73.5
h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 121.2 75.5
h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.0 51.5
h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 79.5 53.7
h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 63.0 28.5
h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 48.7 33.2
h264_v_loop_filter_chroma_10bpp_c: 260.2 135.5
h264_v_loop_filter_chroma_10bpp_neon: 72.2 49.2
h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 70.7
h264_v_loop_filter_chroma_intra_10bpp_neon: 48.7 32.0
Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
---
Removed leftover code, moved from 32-bit, and started loading with two
alternating registers; the code became quite a bit faster!
libavcodec/aarch64/h264dsp_init_aarch64.c | 37 ++++
libavcodec/aarch64/h264dsp_neon.S | 255 ++++++++++++++++++++++
2 files changed, 292 insertions(+)
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index cbaf8d31eb..6bf3ecb8a1 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -83,6 +83,29 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[5 * 8]);
+void ff_h264_v_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+ int beta);
+void ff_h264_h_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+ int beta);
+void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+
av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
@@ -125,5 +148,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
c->h264_idct8_add = ff_h264_idct8_add_neon;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
+ } else if (have_neon(cpu_flags) && bit_depth == 10) {
+ c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10;
+ c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10;
+
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10;
+ c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
+ c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10;
+ c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10;
+ c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10;
+ c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
+ }
}
}
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 997082498f..80b7ed5ce1 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -819,3 +819,258 @@ endfunc
weight_func 16
weight_func 8
weight_func 4
+
+.macro h264_loop_filter_start_10 // shared prologue: w2 = alpha, w3 = beta, x4 -> four tc0 bytes; executes ret (leaving the enclosing function) when nothing can be filtered
+ cmp w2, #0 // Z = (alpha == 0)
+ ldr w6, [x4] // w6 = the four packed tc0 bytes; stays live so callers can reload v24 from it
+ ccmp w3, #0, #4, ne // alpha != 0: Z = (beta == 0); alpha == 0: load nzcv = #4 (Z set) so the b.eq below returns as well
+ lsl w2, w2, #2 // alpha << 2 (2 = BIT_DEPTH - 8 for the 10-bit functions)
+ mov v24.S[0], w6 // v24.s[0] = tc0 bytes, consumed by h264_loop_filter_chroma_10
+ lsl w3, w3, #2 // beta << 2
+ and w8, w6, w6, lsl #16 // top byte = tc0[3] & tc0[1]; plain AND leaves the ccmp flags intact
+ b.eq 1f // alpha == 0 or beta == 0: nothing to do
+ ands w8, w8, w8, lsl #8 // bit 31 (N flag) = AND of the sign bits of all four tc0 bytes
+ b.ge 2f // N clear: at least one tc0 byte is non-negative, go on and filter
+1:
+ ret
+2:
+.endm
+
+.macro h264_loop_filter_start_intra_10 // shared intra prologue: w2 = alpha, w3 = beta; out: v30 = alpha << 2, v31 = beta << 2 in every 16-bit lane; clobbers w4
+ orr w4, w2, w3 // w4 == 0 only when alpha and beta are both zero
+ cbnz w4, 1f // anything non-zero: continue with the filter
+ ret // alpha == 0 and beta == 0: return from the enclosing function
+1:
+ lsl w2, w2, #2 // alpha << 2
+ lsl w3, w3, #2 // beta << 2
+ dup v30.8h, w2 // alpha
+ dup v31.8h, w3 // beta
+.endm
+
+.macro h264_loop_filter_chroma_10 // in: v18 = p1, v16 = p0, v0 = q0, v2 = q1 (8 x 16-bit), w2/w3 = scaled alpha/beta, v24.s[0] = tc0 bytes; out: v16/v0 filtered; branches to the user's 9: label when no lane passes
+ dup v22.8h, w2 // alpha
+ dup v23.8h, w3 // beta
+ uxtl v24.8h, v24.8b // tc0
+
+ uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0)
+ uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0)
+ uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0)
+ cmhi v26.8h, v22.8h, v26.8h // < alpha
+ cmhi v28.8h, v23.8h, v28.8h // < beta
+ cmhi v30.8h, v23.8h, v30.8h // < beta
+
+ and v26.16b, v26.16b, v28.16b // mask = (|p0-q0| < alpha) & (|p1-p0| < beta)
+ mov v4.16b, v0.16b // v4 = q0
+ sub v4.8h, v4.8h, v16.8h // v4 = q0 - p0
+ and v26.16b, v26.16b, v30.16b // mask &= (|q1-q0| < beta)
+ shl v4.8h, v4.8h, #2 // v4 = (q0 - p0) << 2
+ mov x8, v26.d[0] // low half of the mask, for the early-out test
+ mov x9, v26.d[1] // high half of the mask
+ sli v24.8H, v24.8H, #8 // copy each tc0 byte into the upper byte of its halfword
+ uxtl v24.8H, v24.8B // widen again: every tc0 value now fills two adjacent 16-bit lanes
+ add v4.8h, v4.8h, v18.8h // v4 = ((q0 - p0) << 2) + p1
+ shl v24.8h, v24.8h, #2 // tc0 << 2
+
+ adds x8, x8, x9 // Z set when no lane of the mask is set
+ b.eq 9f // nothing to filter: skip to 9: (placed before ret by the users in this file)
+
+ movi v31.8h, #3 // (tc0 - 1) << (BIT_DEPTH - 8)) + 1
+ uqsub v24.8h, v24.8h, v31.8h // tc = (tc0 << 2) - 3, unsigned-saturated at 0
+ sub v4.8h , v4.8h, v2.8h // v4 = ((q0 - p0) << 2) + p1 - q1
+ srshr v4.8h, v4.8h, #3 // delta = (v4 + 4) >> 3 (signed rounding shift)
+ smin v4.8h, v4.8h, v24.8h // delta = min(delta, tc)
+ neg v25.8h, v24.8h // -tc
+ smax v4.8h, v4.8h, v25.8h // delta = max(delta, -tc)
+ and v4.16B, v4.16B, v26.16B // zero delta in lanes that failed the alpha/beta tests
+ add v16.8h, v16.8h, v4.8h // p0 += delta
+ sub v0.8h, v0.8h, v4.8h // q0 -= delta
+
+ mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping
+ movi v5.8h, #0 // lower clipping bound
+ smin v0.8h, v0.8h, v4.8h // q0 = min(q0, 1023)
+ smax v16.8h, v16.8h, v5.8h // p0 = max(p0, 0)
+ smax v0.8h, v0.8h, v5.8h // q0 = max(q0, 0)
+ smin v16.8h, v16.8h, v4.8h // p0 = min(p0, 1023)
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon_10, export=1 // filter across a horizontal edge; x0 = pix, x1 = stride (bytes), w2 = alpha, w3 = beta, x4 = tc0
+ h264_loop_filter_start_10
+
+ mov x10, x0 // x10 walks the rows at and below the edge
+ sub x0, x0, x1, lsl #1 // x0 = pix - 2 * stride, walks the rows above the edge
+ ld1 {v18.8h}, [x0 ], x1 // p1 = row at pix - 2 * stride
+ ld1 {v0.8h}, [x10], x1 // q0 = row at pix
+ ld1 {v16.8h}, [x0 ], x1 // p0 = row at pix - stride
+ ld1 {v2.8h}, [x10] // q1 = row at pix + stride (no post-increment)
+
+ h264_loop_filter_chroma_10
+
+ sub x0, x10, x1, lsl #1 // x10 is pix + stride here, so x0 = pix - stride
+ st1 {v16.8h}, [x0], x1 // write back filtered p0
+ st1 {v0.8h}, [x0], x1 // write back filtered q0
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon_10, export=1 // filter across a vertical edge over 8 rows; x0 = pix, x1 = stride (bytes), w2 = alpha, w3 = beta, x4 = tc0
+ h264_loop_filter_start_10
+
+ sub x0, x0, #4 // access the 2nd left pixel
+h_loop_filter_chroma420_10: // shared entry, also reached from the chroma422 variant with x0, x1, w2, w3 and v24 already set up
+ add x10, x0, x1, lsl #2 // x10 = four rows below x0
+ ld1 {v18.d}[0], [x0 ], x1 // row 0 (4 pixels) -> low half
+ ld1 {v18.d}[1], [x10], x1 // row 4 -> high half
+ ld1 {v16.d}[0], [x0 ], x1 // row 1
+ ld1 {v16.d}[1], [x10], x1 // row 5
+ ld1 {v0.d}[0], [x0 ], x1 // row 2
+ ld1 {v0.d}[1], [x10], x1 // row 6
+ ld1 {v2.d}[0], [x0 ], x1 // row 3
+ ld1 {v2.d}[1], [x10], x1 // row 7
+
+ transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 // rows -> columns: v18 = p1, v16 = p0, v0 = q0, v2 = q1 as the filter macro expects
+
+ h264_loop_filter_chroma_10
+
+ transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 // back to row layout for the stores
+
+ sub x0, x10, x1, lsl #3 // x10 advanced to 8 rows past the start; rewind to row 0
+ st1 {v18.d}[0], [x0], x1 // row 0
+ st1 {v16.d}[0], [x0], x1 // row 1
+ st1 {v0.d}[0], [x0], x1 // row 2
+ st1 {v2.d}[0], [x0], x1 // row 3
+ st1 {v18.d}[1], [x0], x1 // row 4
+ st1 {v16.d}[1], [x0], x1 // row 5
+ st1 {v0.d}[1], [x0], x1 // row 6
+ st1 {v2.d}[1], [x0], x1 // row 7
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma422_neon_10, export=1 // runs the 8-row body twice at doubled stride: even rows, then odd rows; same arguments as the 420 function
+ h264_loop_filter_start_10
+ add x5, x0, x1 // x5 = pix + stride, start of the odd rows for the second pass
+ sub x0, x0, #4 // step back 4 bytes, same offset the 420 entry applies
+ add x1, x1, x1 // stride *= 2 so each pass visits every other row
+ mov x7, x30 // keep the return address; bl overwrites x30
+ bl h_loop_filter_chroma420_10 // first pass: rows 0, 2, 4, ...
+ mov x30, x7 // restore the real return address
+ sub x0, x5, #4 // second pass starts one original row lower
+ mov v24.s[0], w6 // reload the tc0 bytes; the filter macro overwrote v24
+ b h_loop_filter_chroma420_10 // second pass as a tail call; its ret returns to our caller
+endfunc
+
+.macro h264_loop_filter_chroma_intra_10 // in: v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30 = alpha, v31 = beta; out: v16/v17 filtered; clobbers x2, x3; branches to the user's 9: label when no lane passes
+ uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0)
+ uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0)
+ uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0)
+ cmhi v26.8h, v30.8h, v26.8h // < alpha
+ cmhi v27.8h, v31.8h, v27.8h // < beta
+ cmhi v28.8h, v31.8h, v28.8h // < beta
+ and v26.16b, v26.16b, v27.16b // mask = (|p0-q0| < alpha) & (|p1-p0| < beta)
+ and v26.16b, v26.16b, v28.16b // mask &= (|q1-q0| < beta)
+ mov x2, v26.d[0] // low half of the mask, for the early-out test
+ mov x3, v26.d[1] // high half of the mask
+
+ shl v4.8h, v18.8h, #1 // 2 * p1
+ shl v6.8h, v19.8h, #1 // 2 * q1
+
+ adds x2, x2, x3 // Z set when no lane of the mask is set
+ b.eq 9f // nothing to filter: skip to 9:
+
+ add v20.8h, v16.8h, v19.8h // p0 + q1
+ add v22.8h, v17.8h, v18.8h // q0 + p1
+ add v20.8h, v20.8h, v4.8h // 2*p1 + p0 + q1
+ add v22.8h, v22.8h, v6.8h // 2*q1 + q0 + p1
+ urshr v24.8h, v20.8h, #2 // new p0 = (2*p1 + p0 + q1 + 2) >> 2
+ urshr v25.8h, v22.8h, #2 // new q0 = (2*q1 + q0 + p1 + 2) >> 2
+ bit v16.16b, v24.16b, v26.16b // take new p0 only where the mask is set
+ bit v17.16b, v25.16b, v26.16b // take new q0 only where the mask is set
+.endm
+
+function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 // intra filter across a horizontal edge; x0 = pix, x1 = stride (bytes), w2 = alpha, w3 = beta
+ h264_loop_filter_start_intra_10
+ mov x9, x0 // x9 walks the rows at and below the edge
+ sub x0, x0, x1, lsl #1 // x0 = pix - 2 * stride, walks the rows above the edge
+ ld1 {v18.8h}, [x0], x1 // p1 = row at pix - 2 * stride
+ ld1 {v17.8h}, [x9], x1 // q0 = row at pix
+ ld1 {v16.8h}, [x0], x1 // p0 = row at pix - stride
+ ld1 {v19.8h}, [x9] // q1 = row at pix + stride (no post-increment)
+
+ h264_loop_filter_chroma_intra_10
+
+ sub x0, x9, x1, lsl #1 // x9 is pix + stride here, so x0 = pix - stride
+ st1 {v16.8h}, [x0], x1 // write back filtered p0
+ st1 {v17.8h}, [x0], x1 // write back filtered q0
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 // intra filter across a vertical edge over 4 rows; x0 = pix, x1 = stride (bytes), w2 = alpha, w3 = beta
+ h264_loop_filter_start_intra_10
+
+ sub x4, x0, #4 // load pointer: 4 bytes (two 16-bit pixels) left of pix
+ sub x0, x0, #2 // store pointer: one pixel left of pix, where the p0/q0 pair lives
+ add x9, x4, x1, lsl #1 // x9 = two rows below x4
+ ld1 {v18.8h}, [x4], x1 // row 0; NOTE(review): loads 8 halfwords per row although only lanes 0-3 are stored below -- confirm the extra 8 bytes are always readable
+ ld1 {v17.8h}, [x9], x1 // row 2
+ ld1 {v16.8h}, [x4], x1 // row 1
+ ld1 {v19.8h}, [x9], x1 // row 3
+
+ transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 // rows 0..3 -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1
+
+ h264_loop_filter_chroma_intra_10
+
+ st2 {v16.h,v17.h}[0], [x0], x1 // row 0: store the p0/q0 pair
+ st2 {v16.h,v17.h}[1], [x0], x1 // row 1
+ st2 {v16.h,v17.h}[2], [x0], x1 // row 2
+ st2 {v16.h,v17.h}[3], [x0], x1 // row 3
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 // intra filter across a vertical edge over 8 rows; x0 = pix, x1 = stride (bytes), w2 = alpha, w3 = beta
+ h264_loop_filter_start_intra_10
+ sub x4, x0, #4 // load pointer: 4 bytes (two 16-bit pixels) left of pix
+ sub x0, x0, #2 // store pointer: one pixel left of pix, where the p0/q0 pair lives
+h_loop_filter_chroma420_intra_10: // shared entry, also reached from the chroma422 intra variant with x0, x4, x1, v30 and v31 already set up
+ add x9, x4, x1, lsl #2 // x9 = four rows below x4
+ ld1 {v18.4h}, [x4], x1 // row 0 (4 pixels) -> low half
+ ld1 {v18.d}[1], [x9], x1 // row 4 -> high half
+ ld1 {v16.4h}, [x4], x1 // row 1
+ ld1 {v16.d}[1], [x9], x1 // row 5
+ ld1 {v17.4h}, [x4], x1 // row 2
+ ld1 {v17.d}[1], [x9], x1 // row 6
+ ld1 {v19.4h}, [x4], x1 // row 3
+ ld1 {v19.d}[1], [x9], x1 // row 7; x9 now points 8 rows past the starting x4
+
+ transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 // rows -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1
+
+ h264_loop_filter_chroma_intra_10
+
+ st2 {v16.h,v17.h}[0], [x0], x1 // row 0: store the p0/q0 pair
+ st2 {v16.h,v17.h}[1], [x0], x1 // row 1
+ st2 {v16.h,v17.h}[2], [x0], x1 // row 2
+ st2 {v16.h,v17.h}[3], [x0], x1 // row 3
+ st2 {v16.h,v17.h}[4], [x0], x1 // row 4
+ st2 {v16.h,v17.h}[5], [x0], x1 // row 5
+ st2 {v16.h,v17.h}[6], [x0], x1 // row 6
+ st2 {v16.h,v17.h}[7], [x0], x1 // row 7
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 // runs the 8-row intra body twice: rows 0-7, then rows 8-15; same arguments as the 420 intra function
+ h264_loop_filter_start_intra_10
+ sub x4, x0, #4 // load pointer for the first 8 rows
+ add x5, x0, x1, lsl #3 // x5 = pix + 8 * stride, base of the second 8 rows
+ sub x0, x0, #2 // store pointer for the first 8 rows
+ mov x7, x30 // keep the return address; bl overwrites x30
+ bl h_loop_filter_chroma420_intra_10 // first pass: rows 0-7
+ mov x4, x9 // the body leaves x9 8 rows past its starting x4: load pointer for rows 8-15
+ sub x0, x5, #2 // store pointer for rows 8-15
+ mov x30, x7 // restore the real return address
+ b h_loop_filter_chroma420_intra_10 // second pass as a tail call; its ret returns to our caller
+endfunc
--
2.32.0
More information about the ffmpeg-devel
mailing list