[FFmpeg-devel] [PATCH] lavc/aarch64: h264qpel, add lowpass_8 based functions
Martin Storsjö
martin at martin.st
Fri Sep 3 12:53:41 EEST 2021
On Thu, 19 Aug 2021, Mikhail Nitenko wrote:
> diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
> index d27cfac494..eb18469b7f 100644
> --- a/libavcodec/aarch64/h264qpel_neon.S
> +++ b/libavcodec/aarch64/h264qpel_neon.S
> @@ -932,3 +932,518 @@ endfunc
>
> h264_qpel16 put
> h264_qpel16 avg
> +
> +//trashes v0-v5, v7
> +.macro lowpass_8_10 r0, r1, r2, r3, d0, d1
> + ext v2.16B, \r0\().16B, \r1\().16B, #4
> + ext v3.16B, \r0\().16B, \r1\().16B, #6
> + add v2.8H, v2.8H, v3.8H
> + ext v4.16B, \r0\().16B, \r1\().16B, #2
> + ext v5.16B, \r0\().16B, \r1\().16B, #8
> + add v4.8H, v4.8H, v5.8H
> + ext v1.16B, \r0\().16B, \r1\().16B, #10
> + uaddl2 \d1\().4S, \r0\().8H, v1.8H
> + uaddl \d0\().4S, \r0\().4H, v1.4H
> + ext v0.16B, \r2\().16B, \r3\().16B, #4
Nit: Indentation is off for the center column
> + umlal \d0\().4S, v2.4H, v6.H[1]
> + umlal2 \d1\().4S, v2.8H, v6.H[1]
> + ext v1.16B, \r2\().16B, \r3\().16B, #6
> + add v0.8H, v0.8H, v1.8H
> + ext v1.16B, \r2\().16B, \r3\().16B, #2
> + umlsl \d0\().4S, v4.4H, v6.H[0]
> + umlsl2 \d1\().4S, v4.8H, v6.H[0]
I see why you need to go to 32 bit here, but I think this could be kept in
16 bit with this trick:
First do the add + mla of the two positive coefficients. This can go
outside the range of a signed 16 bit integer, so it must be treated as
unsigned 16 bit. Then do a mul of the negative coefficient (corresponding
to the umlsl here) into a separate register; this, too, is treated as an
unsigned 16 bit value.
Then we do a uqsub of these two 16 bit values; the result is nonnegative,
but still possibly larger than the signed 16 bit range. So finally you do
urshr instead of sqrshrun (and maybe also umin instead of smin).
Previously you had:
- 2 uaddl (16->32)
- 2 umlal (16->32)
- 2 umlsl (16->32)
- 2 sqrshrun (32->16)
With this, you'd get this down to:
- 1 add
- 1 mla
- 1 mul
- 1 uqsub
- 1 urshr
So 5 instructions instead of 8.
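For illustration, an untested sketch of what that could look like
(register numbering is arbitrary; I'm assuming the same constants as in
the patch, i.e. v6.h[0] = 5 and v6.h[1] = 20, plus the 1023 vector for
the clip, kept in v7 here):

        // v16..v21 hold the six filter taps a..f, one 8H vector each
        add     v0.8h,  v18.8h, v19.8h  // c + d
        add     v1.8h,  v17.8h, v20.8h  // b + e
        add     v2.8h,  v16.8h, v21.8h  // a + f
        mla     v2.8h,  v0.8h,  v6.h[1] // a + f + 20*(c+d), fits in unsigned 16 bit
        mul     v1.8h,  v1.8h,  v6.h[0] // 5*(b+e)
        uqsub   v2.8h,  v2.8h,  v1.8h   // saturates at 0, like the clip to 0 would
        urshr   v2.8h,  v2.8h,  #5      // (x + 16) >> 5, still unsigned
        umin    v2.8h,  v2.8h,  v7.8h   // clip to 1023

The uqsub saturating at zero matches what sqrshrun would have clamped to
0 on negative results, so this should come out bit identical.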
As there are fewer of each operation, it might be good to interleave this
more with the second calculation, if there are enough registers, to avoid
stalling on a long sequential chain of operations on one single register.
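Purely as an illustration of that scheduling (hypothetically assuming the
two rows' taps sit in v16-v21 and v24-v29), alternating the two chains so
that no instruction depends on the one right before it:

        add     v0.8h,  v18.8h, v19.8h  // row 0: c + d
        add     v3.8h,  v26.8h, v27.8h  // row 1: c + d
        add     v1.8h,  v17.8h, v20.8h  // row 0: b + e
        add     v4.8h,  v25.8h, v28.8h  // row 1: b + e
        add     v2.8h,  v16.8h, v21.8h  // row 0: a + f
        add     v5.8h,  v24.8h, v29.8h  // row 1: a + f
        mla     v2.8h,  v0.8h,  v6.h[1]
        mla     v5.8h,  v3.8h,  v6.h[1]

and so on through the mul/uqsub/urshr/umin tail.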
> + sqrshrun \d0\().4H, \d0\().4S, #5
> + sqrshrun2 \d0\().8H, \d1\().4S, #5
> + ext v3.16B, \r2\().16B, \r3\().16B, #8
> + add v1.8H, v1.8H, v3.8H
> + ext v2.16B, \r2\().16B, \r3\().16B, #10
> + uaddl v3.4S, \r2\().4H, v2.4H
> + uaddl2 v4.4S, \r2\().8H, v2.8H
> + umlal v3.4S, v0.4H, v6.H[1]
> + umlal2 v4.4S, v0.8H, v6.H[1]
> + umlsl v3.4S, v1.4H, v6.H[0]
> + umlsl2 v4.4S, v1.8H, v6.H[0]
> + mvni v5.8h, #0xFC, lsl #8 // 1023 for clipping
> + sqrshrun \d1\().4H, v3.4S, #5
> + sqrshrun2 \d1\().8H, v4.4S, #5
> + smin \d0\().8H, \d0\().8H, v5.8h
> + smin \d1\().8H, \d1\().8H, v5.8h
> +.endm
> +
> +function put_h264_qpel16_h_lowpass_neon_packed_10
> + mov x4, x30
> + mov x12, #32
> + mov x3, #16
> + bl put_h264_qpel8_h_lowpass_neon_10
> + sub x1, x1, x2, lsl #4
> + add x1, x1, #16
> + mov x12, #32
> + mov x30, x4
> + b put_h264_qpel8_h_lowpass_neon_10
> +endfunc
> +
> +.macro h264_qpel_h_lowpass_10 type
> +function \type\()_h264_qpel16_h_lowpass_neon_10
> + mov x13, x30
> + mov x12, #32
> + bl \type\()_h264_qpel8_h_lowpass_neon_10
> + sub x0, x0, x3, lsl #4
> + sub x1, x1, x2, lsl #4
> + add x0, x0, #16
> + add x1, x1, #16
> + mov x12, #32
> + mov x30, x13
> +endfunc
> +
> +function \type\()_h264_qpel8_h_lowpass_neon_10
> +1: ld1 {v28.8H, v29.8H}, [x1], x2
> + ld1 {v16.8H, v17.8H}, [x1], x2
> + subs x12, x12, #4
> + lowpass_8_10 v28, v29, v16, v17, v28, v20
> + .ifc \type,avg
> + ld1 {v2.8H}, [x0], x3
> + urhadd v28.8H, v28.8H, v2.8H
> + ld1 {v3.8H}, [x0]
> + urhadd v20.8H, v20.8H, v3.8H
> + sub x0, x0, x3
> + .endif
> + st1 {v28.8H}, [x0], x3
> + st1 {v20.8H}, [x0], x3
> + b.ne 1b
> + ret
> +endfunc
> +.endm
> +
> + h264_qpel_h_lowpass_10 put
> + h264_qpel_h_lowpass_10 avg
> +
> +.macro h264_qpel_h_lowpass_l2_10 type
> +function \type\()_h264_qpel16_h_lowpass_l2_neon_10
> + mov x13, x30
> + mov x12, #32
> + bl \type\()_h264_qpel8_h_lowpass_l2_neon_10
> + sub x0, x0, x2, lsl #4
> + sub x1, x1, x2, lsl #4
> + sub x3, x3, x2, lsl #4
> + add x0, x0, #16
> + add x1, x1, #16
> + add x3, x3, #16
> + mov x12, #32
> + mov x30, x13
> +endfunc
> +
> +function \type\()_h264_qpel8_h_lowpass_l2_neon_10
> +1: ld1 {v26.8H, v27.8H}, [x1], x2
> + ld1 {v16.8H, v17.8H}, [x1], x2
> + ld1 {v28.8H}, [x3], x2
> + ld1 {v29.8H}, [x3], x2
> + subs x12, x12, #4
> + lowpass_8_10 v26, v27, v16, v17, v26, v27
> + urhadd v26.8H, v26.8H, v28.8H
> + urhadd v27.8H, v27.8H, v29.8H
> + .ifc \type,avg
> + ld1 {v2.8H}, [x0], x2
> + urhadd v26.8H, v26.8H, v2.8H
> + ld1 {v3.8H}, [x0]
> + urhadd v27.8H, v27.8H, v3.8H
> + sub x0, x0, x2
> + .endif
> + st1 {v26.8H}, [x0], x2
> + st1 {v27.8H}, [x0], x2
> + b.ne 1b
> + ret
> +endfunc
> +.endm
> +
> + h264_qpel_h_lowpass_l2_10 put
> + h264_qpel_h_lowpass_l2_10 avg
> +
> +function put_h264_qpel16_v_lowpass_neon_packed_10
> + mov x4, x30
> + mov x2, #8
> + bl put_h264_qpel8_v_lowpass_neon
> + sub x1, x1, x3, lsl #2
> + bl put_h264_qpel8_v_lowpass_neon
> + sub x1, x1, x3, lsl #4
> + sub x1, x1, x3, lsl #2
> + add x1, x1, #8
> + bl put_h264_qpel8_v_lowpass_neon
> + sub x1, x1, x3, lsl #2
> + mov x30, x4
> + b put_h264_qpel8_v_lowpass_neon
> +endfunc
> +
> +.macro h264_qpel_v_lowpass_10 type
> +function \type\()_h264_qpel16_v_lowpass_neon_10
> + mov x4, x30
> + bl \type\()_h264_qpel8_v_lowpass_neon_10
> + sub x1, x1, x3, lsl #2
> + bl \type\()_h264_qpel8_v_lowpass_neon_10
> + sub x0, x0, x2, lsl #4
> + add x0, x0, #16
> + sub x1, x1, x3, lsl #4
> + sub x1, x1, x3, lsl #2
> + add x1, x1, #16
> + bl \type\()_h264_qpel8_v_lowpass_neon_10
> + sub x1, x1, x3, lsl #2
> + mov x30, x4
> +endfunc
> +
> +function \type\()_h264_qpel8_v_lowpass_neon_10
> + ld1 {v16.8H}, [x1], x3
> + ld1 {v18.8H}, [x1], x3
> + ld1 {v20.8H}, [x1], x3
> + ld1 {v22.8H}, [x1], x3
> + ld1 {v24.8H}, [x1], x3
> + ld1 {v26.8H}, [x1], x3
> + ld1 {v28.8H}, [x1], x3
> + ld1 {v30.8H}, [x1], x3
> + ld1 {v17.8H}, [x1], x3
> + ld1 {v19.8H}, [x1], x3
> + ld1 {v21.8H}, [x1], x3
> + ld1 {v23.8H}, [x1], x3
> + ld1 {v25.8H}, [x1]
> +
> + transpose_8x8H v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
> + transpose_8x8H v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
> + lowpass_8_10 v16, v17, v18, v19, v16, v17
> + lowpass_8_10 v20, v21, v22, v23, v18, v19
> + lowpass_8_10 v24, v25, v26, v27, v20, v21
> + lowpass_8_10 v28, v29, v30, v31, v22, v23
> + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
I'm a bit surprised to see this kind of vertical filtering done by
transposing and filtering horizontally, when vertical filtering can be
done so efficiently as-is, without needing any extra 'ext' instructions
and such. But I see that the existing code does it this way. I'll try to
make a PoC of rewriting the existing code for some case, to see how it
behaves without the transposes.
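To sketch the shape of what I mean (untested, same hypothetical constants
as above): for the vertical filter the six taps are just six consecutive
rows, so with one row per register no ext or transpose is needed at all:

        // v16..v21 = six consecutive input rows
        ld1     {v21.8h}, [x1], x3      // load the newest row
        add     v0.8h,  v18.8h, v19.8h  // row2 + row3
        add     v1.8h,  v17.8h, v20.8h  // row1 + row4
        add     v2.8h,  v16.8h, v21.8h  // row0 + row5
        mla     v2.8h,  v0.8h,  v6.h[1]
        mul     v1.8h,  v1.8h,  v6.h[0]
        uqsub   v2.8h,  v2.8h,  v1.8h
        urshr   v2.8h,  v2.8h,  #5
        umin    v2.8h,  v2.8h,  v7.8h   // one output row

Each further output row then only costs one load plus the same handful of
arithmetic instructions, rotating the role of v16-v21.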
// Martin