[FFmpeg-cvslog] aarch64: h264pred: Optimize the inner loop of existing 8 bit functions
Martin Storsjö
git at videolan.org
Wed Apr 14 15:23:53 EEST 2021
ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Mon Apr 12 10:31:22 2021 +0300| [870bfe16a12bf09dca3a4ae27ef6f81a2de80c40] | committer: Martin Storsjö
aarch64: h264pred: Optimize the inner loop of existing 8 bit functions
Move the loop counter decrement further from the branch instruction,
this hides the latency of the decrement.
In loops that first load, then store (the horizontal prediction cases),
do the decrement after the load (where the next instruction would
stall a bit anyway, waiting for the result of the load).
In loops that store twice using the same destination register,
also do the decrement between the two stores (as the second store
would need to wait for the updated destination register from the
first instruction).
In loops that store twice to two different destination registers,
do the decrement before both stores, to do it as soon before the
branch as possible.
This gives minor (1-2 cycle) speedups in most cases (modulo measurement
noise), but the horizontal prediction functions get a rather notable
speedup on the Cortex A53.
Before: Cortex A53 A72 A73
pred8x8_dc_8_neon: 60.7 46.2 39.2
pred8x8_dc_128_8_neon: 30.7 18.0 14.0
pred8x8_horizontal_8_neon: 42.2 29.2 18.5
pred8x8_left_dc_8_neon: 52.7 36.2 32.2
pred8x8_mad_cow_dc_0l0_8_neon: 48.2 27.7 25.7
pred8x8_mad_cow_dc_0lt_8_neon: 52.5 33.2 34.7
pred8x8_mad_cow_dc_l0t_8_neon: 52.5 31.7 33.2
pred8x8_mad_cow_dc_l00_8_neon: 43.2 27.0 25.5
pred8x8_plane_8_neon: 112.2 86.2 88.2
pred8x8_top_dc_8_neon: 40.7 23.0 21.2
pred8x8_vertical_8_neon: 27.2 15.5 14.0
pred16x16_dc_8_neon: 91.0 73.2 70.5
pred16x16_dc_128_8_neon: 43.0 34.7 30.7
pred16x16_horizontal_8_neon: 86.0 49.7 44.7
pred16x16_left_dc_8_neon: 87.0 67.2 67.5
pred16x16_plane_8_neon: 236.0 175.7 173.0
pred16x16_top_dc_8_neon: 53.2 39.0 41.7
pred16x16_vertical_8_neon: 41.7 29.7 31.0
After:
pred8x8_dc_8_neon: 59.0 46.7 42.5
pred8x8_dc_128_8_neon: 28.2 18.0 14.0
pred8x8_horizontal_8_neon: 34.2 29.2 18.5
pred8x8_left_dc_8_neon: 51.0 38.2 32.7
pred8x8_mad_cow_dc_0l0_8_neon: 46.7 28.2 26.2
pred8x8_mad_cow_dc_0lt_8_neon: 55.2 33.7 37.5
pred8x8_mad_cow_dc_l0t_8_neon: 51.2 31.7 37.2
pred8x8_mad_cow_dc_l00_8_neon: 41.7 27.5 26.0
pred8x8_plane_8_neon: 111.5 86.5 89.5
pred8x8_top_dc_8_neon: 39.0 23.2 21.0
pred8x8_vertical_8_neon: 27.2 16.0 14.0
pred16x16_dc_8_neon: 85.0 70.2 70.5
pred16x16_dc_128_8_neon: 42.0 30.0 30.7
pred16x16_horizontal_8_neon: 66.5 49.5 42.5
pred16x16_left_dc_8_neon: 81.0 66.5 67.5
pred16x16_plane_8_neon: 235.0 175.7 173.0
pred16x16_top_dc_8_neon: 52.0 39.0 41.7
pred16x16_vertical_8_neon: 40.2 33.2 31.0
Despite this, a number of these functions still are slower than
what e.g. GCC 7 generates - this shows the relative speedup of the
neon codepaths over the compiler generated ones:
Cortex A53 A72 A73
pred8x8_dc_8_neon: 0.86 0.65 1.04
pred8x8_dc_128_8_neon: 0.59 0.44 0.62
pred8x8_horizontal_8_neon: 1.51 0.58 1.30
pred8x8_left_dc_8_neon: 0.72 0.56 0.89
pred8x8_mad_cow_dc_0l0_8_neon: 0.93 0.93 1.37
pred8x8_mad_cow_dc_0lt_8_neon: 1.37 1.41 1.68
pred8x8_mad_cow_dc_l0t_8_neon: 1.21 1.17 1.32
pred8x8_mad_cow_dc_l00_8_neon: 1.24 1.19 1.60
pred8x8_plane_8_neon: 3.36 3.58 3.76
pred8x8_top_dc_8_neon: 0.97 0.99 1.43
pred8x8_vertical_8_neon: 0.86 0.78 1.18
pred16x16_dc_8_neon: 1.20 1.06 1.49
pred16x16_dc_128_8_neon: 0.83 0.95 0.99
pred16x16_horizontal_8_neon: 1.78 0.96 1.59
pred16x16_left_dc_8_neon: 1.06 0.96 1.32
pred16x16_plane_8_neon: 5.78 6.49 7.19
pred16x16_top_dc_8_neon: 1.48 1.53 1.94
pred16x16_vertical_8_neon: 1.39 1.34 1.98
In particular, on Cortex A72, many of these functions are slower
than the compiler generated code, while they're more beneficial on
e.g. the Cortex A73.
Signed-off-by: Martin Storsjö <martin at martin.st>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=870bfe16a12bf09dca3a4ae27ef6f81a2de80c40
---
libavcodec/aarch64/h264pred_neon.S | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index 213b40b3e7..6fec33cf6a 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -81,8 +81,8 @@ function ff_pred16x16_dc_neon, export=1
.L_pred16x16_dc_end:
mov w3, #8
6: st1 {v0.16b}, [x0], x1
- st1 {v0.16b}, [x0], x1
subs w3, w3, #1
+ st1 {v0.16b}, [x0], x1
b.ne 6b
ret
endfunc
@@ -91,8 +91,8 @@ function ff_pred16x16_hor_neon, export=1
sub x2, x0, #1
mov w3, #16
1: ld1r {v0.16b}, [x2], x1
- st1 {v0.16b}, [x0], x1
subs w3, w3, #1
+ st1 {v0.16b}, [x0], x1
b.ne 1b
ret
endfunc
@@ -102,9 +102,9 @@ function ff_pred16x16_vert_neon, export=1
add x1, x1, x1
ld1 {v0.16b}, [x2], x1
mov w3, #8
-1: st1 {v0.16b}, [x0], x1
+1: subs w3, w3, #1
+ st1 {v0.16b}, [x0], x1
st1 {v0.16b}, [x2], x1
- subs w3, w3, #1
b.ne 1b
ret
endfunc
@@ -158,8 +158,8 @@ function ff_pred16x16_plane_neon, export=1
add v1.8h, v1.8h, v2.8h
sqshrun2 v0.16b, v1.8h, #5
add v1.8h, v1.8h, v3.8h
- st1 {v0.16b}, [x0], x1
subs w3, w3, #1
+ st1 {v0.16b}, [x0], x1
b.ne 1b
ret
endfunc
@@ -175,8 +175,8 @@ function ff_pred8x8_hor_neon, export=1
sub x2, x0, #1
mov w3, #8
1: ld1r {v0.8b}, [x2], x1
- st1 {v0.8b}, [x0], x1
subs w3, w3, #1
+ st1 {v0.8b}, [x0], x1
b.ne 1b
ret
endfunc
@@ -186,9 +186,9 @@ function ff_pred8x8_vert_neon, export=1
lsl x1, x1, #1
ld1 {v0.8b}, [x2], x1
mov w3, #4
-1: st1 {v0.8b}, [x0], x1
+1: subs w3, w3, #1
+ st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x2], x1
- subs w3, w3, #1
b.ne 1b
ret
endfunc
@@ -232,9 +232,9 @@ function ff_pred8x8_plane_neon, export=1
mov w3, #8
1:
sqshrun v0.8b, v1.8h, #5
+ subs w3, w3, #1
add v1.8h, v1.8h, v2.8h
st1 {v0.8b}, [x0], x1
- subs w3, w3, #1
b.ne 1b
ret
endfunc
@@ -290,9 +290,9 @@ function ff_pred8x8_dc_neon, export=1
.L_pred8x8_dc_end:
mov w3, #4
add x2, x0, x1, lsl #2
-6: st1 {v0.8b}, [x0], x1
+6: subs w3, w3, #1
+ st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x2], x1
- subs w3, w3, #1
b.ne 6b
ret
endfunc
More information about the ffmpeg-cvslog
mailing list