[FFmpeg-cvslog] h264/aarch64: add intra loop filter neon asm
Janne Grunau
git at videolan.org
Wed Feb 20 20:46:52 EET 2019
ffmpeg | branch: master | Janne Grunau <janne-libav at jannau.net> | Mon Aug 13 20:43:19 2018 +0200| [28a8b5413b64b831dfb8650208bccd8b78360484] | committer: Janne Grunau
h264/aarch64: add intra loop filter neon asm
Add my neon asm from x264 relicensed under the LGPL 2.1 or later. Ported
(x264 uses nv12 chroma) and optimized.
Cycle count for checkasm --bench on a Snapdragon 820e:
h264_h_loop_filter_luma_intra_8bpp_c: 60.0
h264_h_loop_filter_luma_intra_8bpp_neon: 54.2
h264_v_loop_filter_luma_intra_8bpp_c: 148.3
h264_v_loop_filter_luma_intra_8bpp_neon: 73.8
h264_h_loop_filter_chroma_intra_8bpp_c: 27.8
h264_h_loop_filter_chroma_intra_8bpp_neon: 21.4
h264_h_loop_filter_chroma_mbaff_intra_8bpp_c: 15.8
h264_h_loop_filter_chroma_mbaff_intra_8bpp_neon: 15.7
h264_v_loop_filter_chroma_intra_8bpp_c: 45.8
h264_v_loop_filter_chroma_intra_8bpp_neon: 17.3
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=28a8b5413b64b831dfb8650208bccd8b78360484
---
libavcodec/aarch64/h264dsp_init_aarch64.c | 16 ++
libavcodec/aarch64/h264dsp_neon.S | 297 ++++++++++++++++++++++++++++++
2 files changed, 313 insertions(+)
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index b106f11134..07bda2ff07 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -29,10 +29,20 @@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+ int beta);
+void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+ int beta);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride,
+ int alpha, int beta);
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
@@ -77,8 +87,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
if (have_neon(cpu_flags) && bit_depth == 8) {
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
+ c->h264_v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
+ c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
+
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+ c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
+ c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
+ c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index b649f1d018..448e575b8c 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav at jannau.net>
+ * Copyright (c) 2014 Janne Grunau <janne-libav at jannau.net>
*
* This file is part of Libav.
*
@@ -181,6 +182,203 @@ function ff_h264_h_loop_filter_luma_neon, export=1
ret
endfunc
+
+.macro h264_loop_filter_start_intra
+ orr w4, w2, w3
+ cbnz w4, 1f
+ ret
+1:
+ sxtw x1, w1
+ dup v30.16b, w2 // alpha
+ dup v31.16b, w3 // beta
+.endm
+
+.macro h264_loop_filter_luma_intra
+ uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
+ uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
+ uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
+ cmhi v19.16b, v30.16b, v16.16b // < alpha
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ movi v29.16b, #2
+ ushr v30.16b, v30.16b, #2 // alpha >> 2
+ add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
+ cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
+
+ and v19.16b, v19.16b, v17.16b
+ and v19.16b, v19.16b, v18.16b
+ shrn v20.8b, v19.8h, #4
+ mov x4, v20.d[0]
+ cbz x4, 9f
+
+ ushll v20.8h, v6.8b, #1
+ ushll v22.8h, v1.8b, #1
+ ushll2 v21.8h, v6.16b, #1
+ ushll2 v23.8h, v1.16b, #1
+ uaddw v20.8h, v20.8h, v7.8b
+ uaddw v22.8h, v22.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v7.16b
+ uaddw2 v23.8h, v23.8h, v0.16b
+ uaddw v20.8h, v20.8h, v1.8b
+ uaddw v22.8h, v22.8h, v6.8b
+ uaddw2 v21.8h, v21.8h, v1.16b
+ uaddw2 v23.8h, v23.8h, v6.16b
+
+ rshrn v24.8b, v20.8h, #2 // p0'_1
+ rshrn v25.8b, v22.8h, #2 // q0'_1
+ rshrn2 v24.16b, v21.8h, #2 // p0'_1
+ rshrn2 v25.16b, v23.8h, #2 // q0'_1
+
+ uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
+ uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ and v17.16b, v16.16b, v17.16b // if_2 && if_3
+ and v18.16b, v16.16b, v18.16b // if_2 && if_4
+
+ not v30.16b, v17.16b
+ not v31.16b, v18.16b
+
+ and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
+ and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
+
+ and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
+ and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
+
+ //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
+ uaddl v26.8h, v5.8b, v7.8b
+ uaddl2 v27.8h, v5.16b, v7.16b
+ uaddw v26.8h, v26.8h, v0.8b
+ uaddw2 v27.8h, v27.8h, v0.16b
+ add v20.8h, v20.8h, v26.8h
+ add v21.8h, v21.8h, v27.8h
+ uaddw v20.8h, v20.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v0.16b
+ rshrn v20.8b, v20.8h, #3 // p0'_2
+ rshrn2 v20.16b, v21.8h, #3 // p0'_2
+ uaddw v26.8h, v26.8h, v6.8b
+ uaddw2 v27.8h, v27.8h, v6.16b
+ rshrn v21.8b, v26.8h, #2 // p1'_2
+ rshrn2 v21.16b, v27.8h, #2 // p1'_2
+ uaddl v28.8h, v4.8b, v5.8b
+ uaddl2 v29.8h, v4.16b, v5.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v19.8b, v28.8h, #3 // p2'_2
+ rshrn2 v19.16b, v29.8h, #3 // p2'_2
+
+ //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
+ uaddl v26.8h, v2.8b, v0.8b
+ uaddl2 v27.8h, v2.16b, v0.16b
+ uaddw v26.8h, v26.8h, v7.8b
+ uaddw2 v27.8h, v27.8h, v7.16b
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ uaddw v22.8h, v22.8h, v7.8b
+ uaddw2 v23.8h, v23.8h, v7.16b
+ rshrn v22.8b, v22.8h, #3 // q0'_2
+ rshrn2 v22.16b, v23.8h, #3 // q0'_2
+ uaddw v26.8h, v26.8h, v1.8b
+ uaddw2 v27.8h, v27.8h, v1.16b
+ rshrn v23.8b, v26.8h, #2 // q1'_2
+ rshrn2 v23.16b, v27.8h, #2 // q1'_2
+ uaddl v28.8h, v2.8b, v3.8b
+ uaddl2 v29.8h, v2.16b, v3.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v26.8b, v28.8h, #3 // q2'_2
+ rshrn2 v26.16b, v29.8h, #3 // q2'_2
+
+ bit v7.16b, v24.16b, v30.16b // p0'_1
+ bit v0.16b, v25.16b, v31.16b // q0'_1
+ bit v7.16b, v20.16b, v17.16b // p0'_2
+ bit v6.16b, v21.16b, v17.16b // p1'_2
+ bit v5.16b, v19.16b, v17.16b // p2'_2
+ bit v0.16b, v22.16b, v18.16b // q0'_2
+ bit v1.16b, v23.16b, v18.16b // q1'_2
+ bit v2.16b, v26.16b, v18.16b // q2'_2
+.endm
+
+function ff_h264_v_loop_filter_luma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ ld1 {v0.16b}, [x0], x1 // q0
+ ld1 {v1.16b}, [x0], x1 // q1
+ ld1 {v2.16b}, [x0], x1 // q2
+ ld1 {v3.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #3
+ ld1 {v4.16b}, [x0], x1 // p3
+ ld1 {v5.16b}, [x0], x1 // p2
+ ld1 {v6.16b}, [x0], x1 // p1
+ ld1 {v7.16b}, [x0] // p0
+
+ h264_loop_filter_luma_intra
+
+ sub x0, x0, x1, lsl #1
+ st1 {v5.16b}, [x0], x1 // p2
+ st1 {v6.16b}, [x0], x1 // p1
+ st1 {v7.16b}, [x0], x1 // p0
+ st1 {v0.16b}, [x0], x1 // q0
+ st1 {v1.16b}, [x0], x1 // q1
+ st1 {v2.16b}, [x0] // q2
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_luma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x0, x0, #4
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v5.8b}, [x0], x1
+ ld1 {v6.8b}, [x0], x1
+ ld1 {v7.8b}, [x0], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x0], x1
+ ld1 {v4.d}[1], [x0], x1
+ ld1 {v5.d}[1], [x0], x1
+ ld1 {v6.d}[1], [x0], x1
+ ld1 {v7.d}[1], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+ ld1 {v3.d}[1], [x0], x1
+
+ transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ h264_loop_filter_luma_intra
+
+ transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ sub x0, x0, x1, lsl #4
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x0], x1
+ st1 {v6.8b}, [x0], x1
+ st1 {v7.8b}, [x0], x1
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ st1 {v4.d}[1], [x0], x1
+ st1 {v5.d}[1], [x0], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v7.d}[1], [x0], x1
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+ st1 {v3.d}[1], [x0], x1
+9:
+ ret
+endfunc
+
.macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha
dup v23.8B, w3 // beta
@@ -266,6 +464,105 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
ret
endfunc
+
+.macro h264_loop_filter_chroma_intra
+ uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
+ uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0)
+ uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0)
+ cmhi v26.8b, v30.8b, v26.8b // < alpha
+ cmhi v27.8b, v31.8b, v27.8b // < beta
+ cmhi v28.8b, v31.8b, v28.8b // < beta
+ and v26.8b, v26.8b, v27.8b
+ and v26.8b, v26.8b, v28.8b
+ mov x2, v26.d[0]
+
+ ushll v4.8h, v18.8b, #1
+ ushll v6.8h, v19.8b, #1
+ cbz x2, 9f
+ uaddl v20.8h, v16.8b, v19.8b
+ uaddl v22.8h, v17.8b, v18.8b
+ add v20.8h, v20.8h, v4.8h
+ add v22.8h, v22.8h, v6.8h
+ uqrshrn v24.8b, v20.8h, #2
+ uqrshrn v25.8b, v22.8h, #2
+ bit v16.8b, v24.8b, v26.8b
+ bit v17.8b, v25.8b, v26.8b
+.endm
+
+function ff_h264_v_loop_filter_chroma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x0, x0, x1, lsl #1
+ ld1 {v18.8b}, [x0], x1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v17.8b}, [x0], x1
+ ld1 {v19.8b}, [x0]
+
+ h264_loop_filter_chroma_intra
+
+ sub x0, x0, x1, lsl #1
+ st1 {v16.8b}, [x0], x1
+ st1 {v17.8b}, [x0], x1
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x4, x0, #2
+ sub x0, x0, #1
+ ld1 {v18.8b}, [x4], x1
+ ld1 {v16.8b}, [x4], x1
+ ld1 {v17.8b}, [x4], x1
+ ld1 {v19.8b}, [x4], x1
+
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+
+ h264_loop_filter_chroma_intra
+
+ st2 {v16.b,v17.b}[0], [x0], x1
+ st2 {v16.b,v17.b}[1], [x0], x1
+ st2 {v16.b,v17.b}[2], [x0], x1
+ st2 {v16.b,v17.b}[3], [x0], x1
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x4, x0, #2
+ sub x0, x0, #1
+ ld1 {v18.8b}, [x4], x1
+ ld1 {v16.8b}, [x4], x1
+ ld1 {v17.8b}, [x4], x1
+ ld1 {v19.8b}, [x4], x1
+ ld1 {v18.s}[1], [x4], x1
+ ld1 {v16.s}[1], [x4], x1
+ ld1 {v17.s}[1], [x4], x1
+ ld1 {v19.s}[1], [x4]
+
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+
+ h264_loop_filter_chroma_intra
+
+ st2 {v16.b,v17.b}[0], [x0], x1
+ st2 {v16.b,v17.b}[1], [x0], x1
+ st2 {v16.b,v17.b}[2], [x0], x1
+ st2 {v16.b,v17.b}[3], [x0], x1
+ st2 {v16.b,v17.b}[4], [x0], x1
+ st2 {v16.b,v17.b}[5], [x0], x1
+ st2 {v16.b,v17.b}[6], [x0], x1
+ st2 {v16.b,v17.b}[7], [x0], x1
+
+9:
+ ret
+endfunc
+
+
.macro biweight_16 macs, macd
dup v0.16B, w5
dup v1.16B, w6
More information about the ffmpeg-cvslog
mailing list