[FFmpeg-devel] [PATCH v2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}

Sun Mar 9 15:43:14 EET 2025

On Fri, Mar 7, 2025 at 9:56 PM Martin Storsjö <martin at martin.st> wrote:

> On Mon, 3 Mar 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:
>
> > This patch replaces integer widening with halving addition, and
> > multi-step "emulated" rounding shift with a single asm instruction doing
> > exactly that.
> >
> > Benchmarks before and after:
> > A78
> > avg_8_64x64_neon:                                     2686.2 ( 6.12x)
> > avg_8_128x128_neon:                                  10734.2 ( 5.88x)
> > avg_10_64x64_neon:                                    2536.8 ( 5.40x)
> > avg_10_128x128_neon:                                 10079.0 ( 5.22x)
> > avg_12_64x64_neon:                                    2548.2 ( 5.38x)
> > avg_12_128x128_neon:                                 10133.8 ( 5.19x)
> >
> > avg_8_64x64_neon:                                      897.8 (18.26x)
> > avg_8_128x128_neon:                                   3608.5 (17.37x)
> > avg_10_32x32_neon:                                     444.2 ( 8.51x)
> > avg_10_64x64_neon:                                    1711.8 ( 8.00x)
> > avg_12_64x64_neon:                                    1706.2 ( 8.02x)
> > avg_12_128x128_neon:                                  7010.0 ( 7.46x)
> >
> > A72
> > avg_8_64x64_neon:                                     5823.4 ( 3.88x)
> > avg_8_128x128_neon:                                  17430.5 ( 4.73x)
> > avg_10_64x64_neon:                                    5228.1 ( 3.71x)
> > avg_10_128x128_neon:                                 16722.2 ( 4.17x)
> > avg_12_64x64_neon:                                    5379.1 ( 3.51x)
> > avg_12_128x128_neon:                                 16715.7 ( 4.17x)
> >
> > avg_8_64x64_neon:                                     2006.5 (10.61x)
> > avg_8_128x128_neon:                                   9158.7 ( 8.96x)
> > avg_10_64x64_neon:                                    3357.7 ( 5.60x)
> > avg_10_128x128_neon:                                 12411.7 ( 5.56x)
> > avg_12_64x64_neon:                                    3317.5 ( 5.67x)
> > avg_12_128x128_neon:                                 12358.5 ( 5.58x)
> >
> > A53
> > avg_8_64x64_neon:                                     8327.8 ( 5.18x)
> > avg_8_128x128_neon:                                  31631.3 ( 5.34x)
> > avg_10_64x64_neon:                                    8783.5 ( 4.98x)
> > avg_10_128x128_neon:                                 32617.0 ( 5.25x)
> > avg_12_64x64_neon:                                    8686.0 ( 5.06x)
> > avg_12_128x128_neon:                                 32487.5 ( 5.25x)
> >
> > avg_8_64x64_neon:                                     6032.3 ( 7.17x)
> > avg_8_128x128_neon:                                  22008.5 ( 7.69x)
> > avg_10_64x64_neon:                                    7738.0 ( 5.68x)
> > avg_10_128x128_neon:                                 27813.8 ( 6.14x)
> > avg_12_64x64_neon:                                    7844.5 ( 5.60x)
> > avg_12_128x128_neon:                                 26999.5 ( 6.34x)
> > ---
> > libavcodec/aarch64/vvc/inter.S | 177 ++++++++++++++++++++++++---------
> > 1 file changed, 130 insertions(+), 47 deletions(-)
> >
> > diff --git a/libavcodec/aarch64/vvc/inter.S
> b/libavcodec/aarch64/vvc/inter.S
> > index 0edc861f97..b2f44697d3 100644
> > --- a/libavcodec/aarch64/vvc/inter.S
> > +++ b/libavcodec/aarch64/vvc/inter.S
> > @@ -24,9 +24,9 @@
> > #define BDOF_BLOCK_SIZE 16
> > #define BDOF_MIN_BLOCK_SIZE 4
> >
> > -.macro vvc_avg type, bit_depth
> > +.macro vvc_avg bit_depth
> >
> > -.macro vvc_\type\()_\bit_depth\()_2_4 tap
> > +.macro vvc_w_avg_\bit_depth\()_2_4 tap
> > .if \tap == 2
> >         ldr             s0, [src0]
> >         ldr             s2, [src1]
> > @@ -34,18 +34,11 @@
> >         ldr             d0, [src0]
> >         ldr             d2, [src1]
> > .endif
> > -
> > -.ifc \type, avg
> > -        saddl           v4.4s, v0.4h, v2.4h
> > -        add             v4.4s, v4.4s, v16.4s
> > -        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
> > -.else
> >         mov             v4.16b, v16.16b
> >         smlal           v4.4s, v0.4h, v19.4h
> >         smlal           v4.4s, v2.4h, v20.4h
> >         sqshl           v4.4s, v4.4s, v22.4s
> >         sqxtun          v4.4h, v4.4s
> > -.endif
> >
> > .if \bit_depth == 8
> >         sqxtun          v4.8b, v4.8h
> > @@ -68,7 +61,7 @@
> >         add             dst, dst, dst_stride
> > .endm
> >
> > -function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> > +function ff_vvc_w_avg_\bit_depth\()_neon, export=1
> >         dst             .req x0
> >         dst_stride      .req x1
> >         src0            .req x2
> > @@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> >
> >         mov             x10, #(VVC_MAX_PB_SIZE * 2)
> >         cmp             width, #8
> > -.ifc \type, avg
> > -        movi            v16.4s, #(1 << (14 - \bit_depth))
> > -.else
> >         lsr             x11, x6, #32        // weight0
> >         mov             w12, w6             // weight1
> >         lsr             x13, x7, #32        // offset
> > @@ -91,9 +81,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> >         dup             v20.8h, w12
> >         dup             v16.4s, w13
> >         dup             v22.4s, w14
> > -.endif // avg
> >
> > - .if \bit_depth >= 10
> > +.if \bit_depth >= 10
> >         // clip pixel
> >         mov             w6, #((1 << \bit_depth) - 1)
> >         dup             v17.8h, w6
> > @@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon,
> export=1
> >         b.eq            4f
> > 2:      // width == 2
> >         subs            height, height, #1
> > -        vvc_\type\()_\bit_depth\()_2_4 2
> > +        vvc_w_avg_\bit_depth\()_2_4 2
> >         b.ne            2b
> >         b               32f
> > 4:      // width == 4
> >         subs            height, height, #1
> > -        vvc_\type\()_\bit_depth\()_2_4 4
> > +        vvc_w_avg_\bit_depth\()_2_4 4
> >         b.ne            4b
> >         b               32f
> > 8:      // width == 8
> >         ld1             {v0.8h}, [src0], x10
> >         ld1             {v2.8h}, [src1], x10
> > -.ifc \type, avg
> > -        saddl           v4.4s, v0.4h, v2.4h
> > -        saddl2          v5.4s, v0.8h, v2.8h
> > -        add             v4.4s, v4.4s, v16.4s
> > -        add             v5.4s, v5.4s, v16.4s
> > -        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
> > -        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
> > -.else
> >         mov             v4.16b, v16.16b
> >         mov             v5.16b, v16.16b
> >         smlal           v4.4s, v0.4h, v19.4h
> > @@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> >         sqshl           v5.4s, v5.4s, v22.4s
> >         sqxtun          v4.4h, v4.4s
> >         sqxtun2         v4.8h, v5.4s
> > -.endif
> >         subs            height, height, #1
> > .if \bit_depth == 8
> >         sqxtun          v4.8b, v4.8h
> > @@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon,
> export=1
> > 17:
> >         ldp             q0, q1, [x7], #32
> >         ldp             q2, q3, [x8], #32
> > -.ifc \type, avg
> > -        saddl           v4.4s, v0.4h, v2.4h
> > -        saddl2          v5.4s, v0.8h, v2.8h
> > -        saddl           v6.4s, v1.4h, v3.4h
> > -        saddl2          v7.4s, v1.8h, v3.8h
> > -        add             v4.4s, v4.4s, v16.4s
> > -        add             v5.4s, v5.4s, v16.4s
> > -        add             v6.4s, v6.4s, v16.4s
> > -        add             v7.4s, v7.4s, v16.4s
> > -        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
> > -        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
> > -        sqshrun         v6.4h, v6.4s, #(15 - \bit_depth)
> > -        sqshrun2        v6.8h, v7.4s, #(15 - \bit_depth)
> > -.else   // avg
> >         mov             v4.16b, v16.16b
> >         mov             v5.16b, v16.16b
> >         mov             v6.16b, v16.16b
> > @@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> >         sqxtun          v6.4h, v6.4s
> >         sqxtun2         v4.8h, v5.4s
> >         sqxtun2         v6.8h, v7.4s
> > -.endif  // w_avg
> >         subs            w6, w6, #16
> > .if \bit_depth == 8
> >         sqxtun          v4.8b, v4.8h
> > @@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon,
> export=1
> > endfunc
> > .endm
> >
> > -vvc_avg avg, 8
> > -vvc_avg avg, 10
> > -vvc_avg avg, 12
> > -vvc_avg w_avg, 8
> > -vvc_avg w_avg, 10
> > -vvc_avg w_avg, 12
> > +vvc_avg 8
> > +vvc_avg 10
> > +vvc_avg 12
> > +
> > +.macro vvc_avg2 bit_depth
>
> Instead of naming this vvc_avg2, and the old one (which only produces the
> w_avg function now) vvc_avg, we could rename the old one to vvc_w_avg, and
> the new one to plain vvc_avg.
>
> I did that change and pushed this patch now, thanks!
>

Great!
Thank you, Krzysztof, Zhili and Martin

>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>