[FFmpeg-cvslog] aarch64: hevc: Merge consecutive stores in put_hevc_\type\()_h16_8_neon
Martin Storsjö
git at videolan.org
Tue Mar 26 09:06:22 EET 2024
ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Fri Mar 22 11:38:15 2024 +0200| [e3a54cabde5ea14a16e702cec8bf177a4c214962] | committer: Martin Storsjö
aarch64: hevc: Merge consecutive stores in put_hevc_\type\()_h16_8_neon
This gets rid of a couple of instructions, but the actual performance
is almost identical on Cortex A72/A73. On Cortex A53, it is a
handful of cycles faster.
Signed-off-by: Martin Storsjö <martin at martin.st>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e3a54cabde5ea14a16e702cec8bf177a4c214962
---
libavcodec/aarch64/hevcdsp_qpel_neon.S | 15 +++++----------
1 file changed, 5 insertions(+), 10 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 815d897094..432558bb95 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -512,11 +512,10 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
.ifc \type, qpel
mov dststride, #(MAX_PB_SIZE << 1)
lsl x13, srcstride, #1 // srcstridel
- mov x14, #((MAX_PB_SIZE << 2) - 16)
+ mov x14, #(MAX_PB_SIZE << 2)
.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
- sub x14, x14, #8
.endif
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
@@ -527,10 +526,8 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
bl ff_hevc_put_hevc_h16_8_neon
.ifc \type, qpel
- st1 {v26.8h}, [dst], #16
- st1 {v28.8h}, [x10], #16
- st1 {v27.8h}, [dst], x14
- st1 {v29.8h}, [x10], x14
+ st1 {v26.8h, v27.8h}, [dst], x14
+ st1 {v28.8h, v29.8h}, [x10], x14
.else
.ifc \type, qpel_bi
ld1 {v16.8h, v17.8h}, [ x4], x16
@@ -549,10 +546,8 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
sqrshrun v28.8b, v28.8h, #6
sqrshrun v29.8b, v29.8h, #6
.endif
- st1 {v26.8b}, [dst], #8
- st1 {v28.8b}, [x10], #8
- st1 {v27.8b}, [dst], x14
- st1 {v29.8b}, [x10], x14
+ st1 {v26.8b, v27.8b}, [dst], x14
+ st1 {v28.8b, v29.8b}, [x10], x14
.endif
b.gt 1b // double line
subs width, width, #16
More information about the ffmpeg-cvslog mailing list