[FFmpeg-devel] [aarch64] improve performance of ff_hscale_8_to_15_neon

Clément Bœsch u at pkh.me
Sun Dec 1 13:10:31 EET 2019


On Wed, Nov 27, 2019 at 12:30:35PM -0600, Sebastian Pop wrote:
[...]
> From 9ecaa99fab4b8bedf3884344774162636eaa5389 Mon Sep 17 00:00:00 2001
> From: Sebastian Pop <spop at amazon.com>
> Date: Sun, 17 Nov 2019 14:13:13 -0600
> Subject: [PATCH] [aarch64] use FMA and increase vector factor to 4
> 
> This patch implements ff_hscale_8_to_15_neon with NEON fused multiply accumulate
> and bumps the vectorization factor from 2 to 4.
> The speedup is 34% on Graviton A1 instances based on Cortex-A72 CPUs:
> 
> $ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
> before: t:0.040303 avg:0.040287 max:0.040371 min:0.039214
> after:  t:0.030079 avg:0.030102 max:0.030462 min:0.030051
> 
> Tested with `make check` on aarch64-linux.
> ---
>  libswscale/aarch64/hscale.S | 102 ++++++++++++++++++++++++------------
>  1 file changed, 68 insertions(+), 34 deletions(-)
> 
> diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
> index cc78c1901d..12ee7f09e7 100644
> --- a/libswscale/aarch64/hscale.S
> +++ b/libswscale/aarch64/hscale.S
> @@ -21,39 +21,73 @@
>  #include "libavutil/aarch64/asm.S"
>  
>  function ff_hscale_8_to_15_neon, export=1
> -        add                 x10, x4, w6, UXTW #1        // filter2 = filter + filterSize*2 (x2 because int16)
> -1:      ldr                 w8, [x5], #4                // filterPos[0]
> -        ldr                 w9, [x5], #4                // filterPos[1]
> -        movi                v4.4S, #0                   // val sum part 1 (for dst[0])
> -        movi                v5.4S, #0                   // val sum part 2 (for dst[1])
> -        mov                 w7, w6                      // filterSize counter
> -        mov                 x13, x3                     // srcp = src
> -2:      add                 x11, x13, w8, UXTW          // srcp + filterPos[0]
> -        add                 x12, x13, w9, UXTW          // srcp + filterPos[1]
> -        ld1                 {v0.8B}, [x11]              // srcp[filterPos[0] + {0..7}]
> -        ld1                 {v1.8B}, [x12]              // srcp[filterPos[1] + {0..7}]
> -        ld1                 {v2.8H}, [x4],  #16         // load 8x16-bit filter values, part 1
> -        ld1                 {v3.8H}, [x10], #16         // ditto at filter+filterSize for part 2
> -        uxtl                v0.8H, v0.8B                // unpack part 1 to 16-bit
> -        uxtl                v1.8H, v1.8B                // unpack part 2 to 16-bit
> -        smull               v16.4S, v0.4H, v2.4H        // v16.i32{0..3} = part 1 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
> -        smull               v18.4S, v1.4H, v3.4H        // v18.i32{0..3} = part 1 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
> -        smull2              v17.4S, v0.8H, v2.8H        // v17.i32{0..3} = part 2 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
> -        smull2              v19.4S, v1.8H, v3.8H        // v19.i32{0..3} = part 2 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
> -        addp                v16.4S, v16.4S, v17.4S      // horizontal pair adding of the 8x32-bit multiplied values for part 1 into 4x32-bit
> -        addp                v18.4S, v18.4S, v19.4S      // horizontal pair adding of the 8x32-bit multiplied values for part 2 into 4x32-bit
> -        add                 v4.4S, v4.4S, v16.4S        // update val accumulator for part 1
> -        add                 v5.4S, v5.4S, v18.4S        // update val accumulator for part 2
> -        add                 x13, x13, #8                // srcp += 8
> -        subs                w7, w7, #8                  // processed 8/filterSize
> -        b.gt                2b                          // inner loop if filterSize not consumed completely
> -        mov                 x4, x10                     // filter = filter2
> -        add                 x10, x10, w6, UXTW #1       // filter2 += filterSize*2
> -        addp                v4.4S, v4.4S, v5.4S         // horizontal pair adding of the 8x32-bit sums into 4x32-bit
> -        addp                v4.4S, v4.4S, v4.4S         // horizontal pair adding of the 4x32-bit sums into 2x32-bit
> -        sqshrn              v4.4H, v4.4S, #7            // shift and clip the 2x16-bit final values
> -        st1                 {v4.S}[0], [x1], #4         // write to destination
> -        subs                w2, w2, #2                  // dstW -= 2
> -        b.gt                1b                          // loop until end of line

> +        sxtw                x9, w6
> +        sbfiz               x12, x6, #1, #32
> +        add                 x14, x12, x9
> +        mov                 x8, xzr
> +        sxtw                x10, w2
> +        sbfiz               x11, x6, #3, #32
> +        sbfiz               x13, x6, #2, #32
> +        lsl                 x14, x14, #1

Did you get this weird dance from the compiler? I think comments would be
welcome here to document the different filterSize * N you're computing.

Also, computing the source pointers directly instead of just the offsets
would simplify things later on. See how the original code does it with the
add.
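
FWIW, here is my (untested) reading of the prologue; comments along these
lines would be welcome:

        sxtw                x9,  w6                     // x9  = filterSize
        sbfiz               x12, x6, #1, #32            // x12 = filterSize*2 = byte offset of filter row 1
        add                 x14, x12, x9                // x14 = filterSize*3
        mov                 x8,  xzr                    // x8  = i = 0
        sxtw                x10, w2                     // x10 = dstW
        sbfiz               x11, x6, #3, #32            // x11 = filterSize*8 = byte stride over 4 filter rows
        sbfiz               x13, x6, #2, #32            // x13 = filterSize*4 = byte offset of filter row 2
        lsl                 x14, x14, #1                // x14 = filterSize*6 = byte offset of filter row 3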

> +1:      lsl                 x17, x8, #2

Can't you increment x8 by the appropriate amount at the end of the loop to
save this shift?
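
Something like this (untested sketch; the dstW bound and the destination
offset would then need the matching scaling):

1:      ldrsw               x18, [x5, x8]               // filterPos[0], x8 kept as a byte offset
        [...]
        add                 x8, x8, #16                 // advance past the 4 int32 filterPos entries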

> +        ldrsw               x18, [x5, x17]              // filterPos[0]
> +        orr                 x0, x17, #0x4
> +        orr                 x2, x17, #0x8
> +        orr                 x17, x17, #0xc
> +        ldrsw               x0, [x5, x0]                // filterPos[1]
> +        ldrsw               x2, [x5, x2]                // filterPos[2]
> +        ldrsw               x6, [x5, x17]               // filterPos[3]
> +        mov                 x15, xzr                    // j = 0
> +        mov                 x16, x4                     // filter0 = filter

> +        movi                v0.2d, #0                   // val sum part 1 (for dst[0])
> +        movi                v2.2d, #0                   // val sum part 2 (for dst[1])
> +        movi                v1.2d, #0                   // val sum part 3 (for dst[2])
> +        movi                v3.2d, #0                   // val sum part 4 (for dst[3])

Please use the capitalized form for 8B, 4H, 2D, etc. for consistency
with the original code. It may help the diff in some cases.
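
i.e.:

        movi                v0.2D, #0                   // val sum part 1 (for dst[0])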

> +        add                 x17, x3, x18                // srcp + filterPos[0]
> +        add                 x18, x3, x0                 // srcp + filterPos[1]
> +        add                 x0, x3, x2                  // srcp + filterPos[2]
> +        add                 x2, x3, x6                  // srcp + filterPos[3]

> +2:      ldr                 d4, [x17, x15]              // srcp[filterPos[0] + {0..7}]
> +        ldr                 q5, [x16]                   // load 8x16-bit filter values, part 1
> +        ldr                 d6, [x18, x15]              // srcp[filterPos[1] + {0..7}]
> +        ldr                 q7, [x16, x12]              // load 8x16-bit at filter+filterSize

Why not use ld1 {v4.8B} etc. like it was before? The use of Dn/Qn is
very confusing here.
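
e.g. (untested; assuming a scratch register such as x7 is still free):

        add                 x7, x17, x15                // srcp + filterPos[0] + j
        ld1                 {v4.8B}, [x7]               // srcp[filterPos[0] + j + {0..7}]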

> +        ushll               v4.8h, v4.8b, #0            // unpack part 1 to 16-bit
> +        smlal               v0.4s, v4.4h, v5.4h         // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
> +        smlal2              v0.4s, v4.8h, v5.8h         // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]

> +        ldr                 d4, [x0, x15]               // srcp[filterPos[2] + {0..7}]
> +        ldr                 q5, [x16, x13]              // load 8x16-bit at filter+2*filterSize

ditto ld1 {vN...}

> +        ushll               v6.8h, v6.8b, #0            // unpack part 2 to 16-bit
> +        smlal               v2.4s, v6.4h, v7.4h         // v2 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
> +        ushll               v4.8h, v4.8b, #0            // unpack part 3 to 16-bit
> +        smlal               v1.4s, v4.4h, v5.4h         // v1 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
> +        smlal2              v1.4s, v4.8h, v5.8h         // v1 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]

> +        ldr                 d4, [x2, x15]               // srcp[filterPos[3] + {0..7}]
> +        ldr                 q5, [x16, x14]              // load 8x16-bit at filter+3*filterSize

ditto ld1 {vN...}

> +        add                 x15, x15, #8                // j += 8
> +        smlal2              v2.4s, v6.8h, v7.8h         // v2 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
> +        ushll               v4.8h, v4.8b, #0            // unpack part 4 to 16-bit
> +        smlal               v3.4s, v4.4h, v5.4h         // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]

> +        cmp                 x15, x9                     // j < filterSize

You can save this comparison if you use a decreasing counter with a subS
instruction (look at the original code).
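
i.e. like the original inner loop, with w7 initialized to filterSize
beforehand:

        subs                w7, w7, #8                  // processed 8/filterSize
        b.gt                2b                          // inner loop if filterSize not consumed completely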

> +        smlal2              v3.4s, v4.8h, v5.8h         // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
> +        add                 x16, x16, #16               // filter0 += 16
> +        b.lt                2b                          // inner loop if filterSize not consumed completely
> +        addp                v0.4s, v0.4s, v0.4s         // part1 horizontal pair adding
> +        addp                v2.4s, v2.4s, v2.4s         // part2 horizontal pair adding
> +        addp                v1.4s, v1.4s, v1.4s         // part3 horizontal pair adding
> +        addp                v3.4s, v3.4s, v3.4s         // part4 horizontal pair adding
> +        addp                v0.4s, v0.4s, v0.4s         // part1 horizontal pair adding
> +        addp                v2.4s, v2.4s, v2.4s         // part2 horizontal pair adding
> +        addp                v1.4s, v1.4s, v1.4s         // part3 horizontal pair adding
> +        addp                v3.4s, v3.4s, v3.4s         // part4 horizontal pair adding
> +        zip1                v0.4s, v0.4s, v2.4s         // part12 = zip values from part1 and part2
> +        zip1                v1.4s, v1.4s, v3.4s         // part34 = zip values from part3 and part4
> +        lsl                 x15, x8, #1
> +        add                 x8, x8, #4                  // i += 4
> +        mov                 v0.d[1], v1.d[0]            // part1234 = zip values from part12 and part34

> +        cmp                 x8, x10                     // i < dstW

ditto subs
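
e.g. (untested):

        subs                w2, w2, #4                  // dstW -= 4
        b.gt                1b                          // loop until end of line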

> +        sqshrn              v0.4h, v0.4s, #7            // shift and clip the 4x16-bit final values
> +        add                 x4, x4, x11                 // filter += filterSize*4
> +        str                 d0, [x1, x15]               // write to destination part1234
> +        b.lt                1b                          // loop until end of line
>          ret

-- 
Clément B.

