[FFmpeg-devel] [PATCH] lavc/aarch64: h264qpel, add lowpass_8 based functions
Martin Storsjö
martin at martin.st
Fri Sep 3 12:53:41 EEST 2021
On Thu, 19 Aug 2021, Mikhail Nitenko wrote:
> diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
> index d27cfac494..eb18469b7f 100644
> --- a/libavcodec/aarch64/h264qpel_neon.S
> +++ b/libavcodec/aarch64/h264qpel_neon.S
> @@ -932,3 +932,518 @@ endfunc
>
> h264_qpel16 put
> h264_qpel16 avg
> +
> +//trashes v0-v5, v7
> +.macro lowpass_8_10 r0, r1, r2, r3, d0, d1
> + ext v2.16B, \r0\().16B, \r1\().16B, #4
> + ext v3.16B, \r0\().16B, \r1\().16B, #6
> + add v2.8H, v2.8H, v3.8H
> + ext v4.16B, \r0\().16B, \r1\().16B, #2
> + ext v5.16B, \r0\().16B, \r1\().16B, #8
> + add v4.8H, v4.8H, v5.8H
> + ext v1.16B, \r0\().16B, \r1\().16B, #10
> + uaddl2 \d1\().4S, \r0\().8H, v1.8H
> + uaddl \d0\().4S, \r0\().4H, v1.4H
> + ext v0.16B, \r2\().16B, \r3\().16B, #4
Nit: Indentation is off for the center column
> + umlal \d0\().4S, v2.4H, v6.H[1]
> + umlal2 \d1\().4S, v2.8H, v6.H[1]
> + ext v1.16B, \r2\().16B, \r3\().16B, #6
> + add v0.8H, v0.8H, v1.8H
> + ext v1.16B, \r2\().16B, \r3\().16B, #2
> + umlsl \d0\().4S, v4.4H, v6.H[0]
> + umlsl2 \d1\().4S, v4.8H, v6.H[0]
I see why you need to go to 32 bit here, but I think this could be kept in
16 bit with this trick:
First do the add + mla of the two positive coefficients. This can go
outside the range of a signed 16 bit integer, so it must be treated as
unsigned 16 bit. Then do a mul of the negative coefficient (corresponding
to the umlsl here) into a separate register; this, too, is treated as an
unsigned 16 bit value.
Then we do a uqsub of these two 16 bit values; the result is nonnegative,
but still possibly larger than the signed 16 bit range. So finally you do
urshr instead of sqrshrun (and maybe also umin instead of smin).
Previously you had:
- 2 uaddl (16->32)
- 2 umlal (16->32)
- 2 umlsl (16->32)
- 2 sqrshrun (32->16)
With this, you'd get this down to:
- 1 add
- 1 mla
- 1 mul
- 1 uqsub
- 1 urshr
So 5 instructions instead of 8.
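For illustration, an untested sketch of what that could look like
(register numbering is arbitrary; I'm assuming the same constants as in
the patch, i.e. v6.h[0] = 5 and v6.h[1] = 20, plus the 1023 vector for
the clip, kept in v7 here):

        // v16..v21 hold the six filter taps a..f, one 8H vector each
        add     v0.8h,  v18.8h, v19.8h  // c + d
        add     v1.8h,  v17.8h, v20.8h  // b + e
        add     v2.8h,  v16.8h, v21.8h  // a + f
        mla     v2.8h,  v0.8h,  v6.h[1] // a + f + 20*(c+d), fits in unsigned 16 bit
        mul     v1.8h,  v1.8h,  v6.h[0] // 5*(b+e)
        uqsub   v2.8h,  v2.8h,  v1.8h   // saturates at 0, like the clip to 0 would
        urshr   v2.8h,  v2.8h,  #5      // (x + 16) >> 5, still unsigned
        umin    v2.8h,  v2.8h,  v7.8h   // clip to 1023

The uqsub saturating at zero matches what sqrshrun would have clamped to
0 on negative results, so this should come out bit identical.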
As there are fewer of each operation, it might be good to interleave this
more with the second calculation, if there are enough registers, to avoid
stalling on a long sequential chain of operations on one single register.
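Purely as an illustration of that scheduling (hypothetically assuming the
two rows' taps sit in v16-v21 and v24-v29), alternating the two chains so
that no instruction depends on the one right before it:

        add     v0.8h,  v18.8h, v19.8h  // row 0: c + d
        add     v3.8h,  v26.8h, v27.8h  // row 1: c + d
        add     v1.8h,  v17.8h, v20.8h  // row 0: b + e
        add     v4.8h,  v25.8h, v28.8h  // row 1: b + e
        add     v2.8h,  v16.8h, v21.8h  // row 0: a + f
        add     v5.8h,  v24.8h, v29.8h  // row 1: a + f
        mla     v2.8h,  v0.8h,  v6.h[1]
        mla     v5.8h,  v3.8h,  v6.h[1]

and so on through the mul/uqsub/urshr/umin tail.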
> + sqrshrun \d0\().4H, \d0\().4S, #5
> + sqrshrun2 \d0\().8H, \d1\().4S, #5
> + ext v3.16B, \r2\().16B, \r3\().16B, #8
> + add v1.8H, v1.8H, v3.8H
> + ext v2.16B, \r2\().16B, \r3\().16B, #10
> + uaddl v3.4S, \r2\().4H, v2.4H
> + uaddl2 v4.4S, \r2\().8H, v2.8H
> + umlal v3.4S, v0.4H, v6.H[1]
> + umlal2 v4.4S, v0.8H, v6.H[1]
> + umlsl v3.4S, v1.4H, v6.H[0]
> + umlsl2 v4.4S, v1.8H, v6.H[0]
> + mvni v5.8h, #0xFC, lsl #8 // 1023 for clipping
> + sqrshrun \d1\().4H, v3.4S, #5
> + sqrshrun2 \d1\().8H, v4.4S, #5
> + smin \d0\().8H, \d0\().8H, v5.8h
> + smin \d1\().8H, \d1\().8H, v5.8h
> +.endm
> +
> +function put_h264_qpel16_h_lowpass_neon_packed_10
> + mov x4, x30
> + mov x12, #32
> + mov x3, #16
> + bl put_h264_qpel8_h_lowpass_neon_10
> + sub x1, x1, x2, lsl #4
> + add x1, x1, #16
> + mov x12, #32
> + mov x30, x4
> + b put_h264_qpel8_h_lowpass_neon_10
> +endfunc
> +
> +.macro h264_qpel_h_lowpass_10 type
> +function \type\()_h264_qpel16_h_lowpass_neon_10
> + mov x13, x30
> + mov x12, #32
> + bl \type\()_h264_qpel8_h_lowpass_neon_10
> + sub x0, x0, x3, lsl #4
> + sub x1, x1, x2, lsl #4
> + add x0, x0, #16
> + add x1, x1, #16
> + mov x12, #32
> + mov x30, x13
> +endfunc
> +
> +function \type\()_h264_qpel8_h_lowpass_neon_10
> +1: ld1 {v28.8H, v29.8H}, [x1], x2
> + ld1 {v16.8H, v17.8H}, [x1], x2
> + subs x12, x12, #4
> + lowpass_8_10 v28, v29, v16, v17, v28, v20
> + .ifc \type,avg
> + ld1 {v2.8H}, [x0], x3
> + urhadd v28.8H, v28.8H, v2.8H
> + ld1 {v3.8H}, [x0]
> + urhadd v20.8H, v20.8H, v3.8H
> + sub x0, x0, x3
> + .endif
> + st1 {v28.8H}, [x0], x3
> + st1 {v20.8H}, [x0], x3
> + b.ne 1b
> + ret
> +endfunc
> +.endm
> +
> + h264_qpel_h_lowpass_10 put
> + h264_qpel_h_lowpass_10 avg
> +
> +.macro h264_qpel_h_lowpass_l2_10 type
> +function \type\()_h264_qpel16_h_lowpass_l2_neon_10
> + mov x13, x30
> + mov x12, #32
> + bl \type\()_h264_qpel8_h_lowpass_l2_neon_10
> + sub x0, x0, x2, lsl #4
> + sub x1, x1, x2, lsl #4
> + sub x3, x3, x2, lsl #4
> + add x0, x0, #16
> + add x1, x1, #16
> + add x3, x3, #16
> + mov x12, #32
> + mov x30, x13
> +endfunc
> +
> +function \type\()_h264_qpel8_h_lowpass_l2_neon_10
> +1: ld1 {v26.8H, v27.8H}, [x1], x2
> + ld1 {v16.8H, v17.8H}, [x1], x2
> + ld1 {v28.8H}, [x3], x2
> + ld1 {v29.8H}, [x3], x2
> + subs x12, x12, #4
> + lowpass_8_10 v26, v27, v16, v17, v26, v27
> + urhadd v26.8H, v26.8H, v28.8H
> + urhadd v27.8H, v27.8H, v29.8H
> + .ifc \type,avg
> + ld1 {v2.8H}, [x0], x2
> + urhadd v26.8H, v26.8H, v2.8H
> + ld1 {v3.8H}, [x0]
> + urhadd v27.8H, v27.8H, v3.8H
> + sub x0, x0, x2
> + .endif
> + st1 {v26.8H}, [x0], x2
> + st1 {v27.8H}, [x0], x2
> + b.ne 1b
> + ret
> +endfunc
> +.endm
> +
> + h264_qpel_h_lowpass_l2_10 put
> + h264_qpel_h_lowpass_l2_10 avg
> +
> +function put_h264_qpel16_v_lowpass_neon_packed_10
> + mov x4, x30
> + mov x2, #8
> + bl put_h264_qpel8_v_lowpass_neon
> + sub x1, x1, x3, lsl #2
> + bl put_h264_qpel8_v_lowpass_neon
> + sub x1, x1, x3, lsl #4
> + sub x1, x1, x3, lsl #2
> + add x1, x1, #8
> + bl put_h264_qpel8_v_lowpass_neon
> + sub x1, x1, x3, lsl #2
> + mov x30, x4
> + b put_h264_qpel8_v_lowpass_neon
> +endfunc
> +
> +.macro h264_qpel_v_lowpass_10 type
> +function \type\()_h264_qpel16_v_lowpass_neon_10
> + mov x4, x30
> + bl \type\()_h264_qpel8_v_lowpass_neon_10
> + sub x1, x1, x3, lsl #2
> + bl \type\()_h264_qpel8_v_lowpass_neon_10
> + sub x0, x0, x2, lsl #4
> + add x0, x0, #16
> + sub x1, x1, x3, lsl #4
> + sub x1, x1, x3, lsl #2
> + add x1, x1, #16
> + bl \type\()_h264_qpel8_v_lowpass_neon_10
> + sub x1, x1, x3, lsl #2
> + mov x30, x4
> +endfunc
> +
> +function \type\()_h264_qpel8_v_lowpass_neon_10
> + ld1 {v16.8H}, [x1], x3
> + ld1 {v18.8H}, [x1], x3
> + ld1 {v20.8H}, [x1], x3
> + ld1 {v22.8H}, [x1], x3
> + ld1 {v24.8H}, [x1], x3
> + ld1 {v26.8H}, [x1], x3
> + ld1 {v28.8H}, [x1], x3
> + ld1 {v30.8H}, [x1], x3
> + ld1 {v17.8H}, [x1], x3
> + ld1 {v19.8H}, [x1], x3
> + ld1 {v21.8H}, [x1], x3
> + ld1 {v23.8H}, [x1], x3
> + ld1 {v25.8H}, [x1]
> +
> + transpose_8x8H v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
> + transpose_8x8H v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
> + lowpass_8_10 v16, v17, v18, v19, v16, v17
> + lowpass_8_10 v20, v21, v22, v23, v18, v19
> + lowpass_8_10 v24, v25, v26, v27, v20, v21
> + lowpass_8_10 v28, v29, v30, v31, v22, v23
> + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
I'm a bit surprised to see this kind of vertical filtering done by
transposing and filtering horizontally, when vertical filtering can be
done so efficiently as-is, without needing any extra 'ext' instructions
and such. But I see that the existing code does it this way. I'll try to
make a PoC of rewriting the existing code for some case, to see how it
behaves without the transposes.
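To sketch the shape of what I mean (untested, same hypothetical constants
as above): for the vertical filter the six taps are just six consecutive
rows, so with one row per register no ext or transpose is needed at all:

        // v16..v21 = six consecutive input rows
        ld1     {v21.8h}, [x1], x3      // load the newest row
        add     v0.8h,  v18.8h, v19.8h  // row2 + row3
        add     v1.8h,  v17.8h, v20.8h  // row1 + row4
        add     v2.8h,  v16.8h, v21.8h  // row0 + row5
        mla     v2.8h,  v0.8h,  v6.h[1]
        mul     v1.8h,  v1.8h,  v6.h[0]
        uqsub   v2.8h,  v2.8h,  v1.8h
        urshr   v2.8h,  v2.8h,  #5
        umin    v2.8h,  v2.8h,  v7.8h   // one output row

Each further output row then only costs one load plus the same handful of
arithmetic instructions, rotating the role of v16-v21.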
// Martin