[FFmpeg-devel] [PATCH 2/2] aarch64/vvc: Use faster clip operation
Zhao Zhili
quinkblack at foxmail.com
Tue Dec 10 06:19:02 EET 2024
From: Zhao Zhili <zhilizhao at tencent.com>
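Replace sqxtn/sqshrn + smin + smax with sqxtun/sqshrun + umin.
The unsigned saturating narrow already clamps negative values to zero,
so only the upper bound needs an explicit clip and the zero vector
(v18) is no longer needed.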
---
libavcodec/aarch64/vvc/inter.S | 41 +++++++++++++++-------------------
1 file changed, 18 insertions(+), 23 deletions(-)
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index b6b079b569..7a752019ee 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -36,13 +36,13 @@
.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
add v4.4s, v4.4s, v16.4s
- sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
+ sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
.else
mov v4.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
sqshl v4.4s, v4.4s, v22.4s
- sqxtn v4.4h, v4.4s
+ sqxtun v4.4h, v4.4s
.endif
.if \bit_depth == 8
@@ -54,8 +54,7 @@
.endif
.else // bit_depth > 8
- smin v4.4h, v4.4h, v17.4h
- smax v4.4h, v4.4h, v18.4h
+ umin v4.4h, v4.4h, v17.4h
.if \tap == 2
str s4, [dst]
.else
@@ -95,7 +94,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
.if \bit_depth >= 10
// clip pixel
mov w6, #((1 << \bit_depth) - 1)
- movi v18.8h, #0
dup v17.8h, w6
.endif
@@ -121,8 +119,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
saddl2 v5.4s, v0.8h, v2.8h
add v4.4s, v4.4s, v16.4s
add v5.4s, v5.4s, v16.4s
- sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
- sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
+ sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
+ sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
.else
mov v4.16b, v16.16b
mov v5.16b, v16.16b
@@ -132,16 +130,15 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
smlal2 v5.4s, v2.8h, v20.8h
sqshl v4.4s, v4.4s, v22.4s
sqshl v5.4s, v5.4s, v22.4s
- sqxtn v4.4h, v4.4s
- sqxtn2 v4.8h, v5.4s
+ sqxtun v4.4h, v4.4s
+ sqxtun2 v4.8h, v5.4s
.endif
subs height, height, #1
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
st1 {v4.8b}, [dst], dst_stride
.else
- smin v4.8h, v4.8h, v17.8h
- smax v4.8h, v4.8h, v18.8h
+ umin v4.8h, v4.8h, v17.8h
st1 {v4.8h}, [dst], dst_stride
.endif
b.ne 8b
@@ -163,10 +160,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
add v5.4s, v5.4s, v16.4s
add v6.4s, v6.4s, v16.4s
add v7.4s, v7.4s, v16.4s
- sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
- sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
- sqshrn v6.4h, v6.4s, #(15 - \bit_depth)
- sqshrn2 v6.8h, v7.4s, #(15 - \bit_depth)
+ sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
+ sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
+ sqshrun v6.4h, v6.4s, #(15 - \bit_depth)
+ sqshrun2 v6.8h, v7.4s, #(15 - \bit_depth)
.else // avg
mov v4.16b, v16.16b
mov v5.16b, v16.16b
@@ -184,10 +181,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
sqshl v5.4s, v5.4s, v22.4s
sqshl v6.4s, v6.4s, v22.4s
sqshl v7.4s, v7.4s, v22.4s
- sqxtn v4.4h, v4.4s
- sqxtn v6.4h, v6.4s
- sqxtn2 v4.8h, v5.4s
- sqxtn2 v6.8h, v7.4s
+ sqxtun v4.4h, v4.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v4.8h, v5.4s
+ sqxtun2 v6.8h, v7.4s
.endif // w_avg
subs w6, w6, #16
.if \bit_depth == 8
@@ -195,10 +192,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
sqxtun2 v4.16b, v6.8h
str q4, [x9], #16
.else
- smin v4.8h, v4.8h, v17.8h
- smin v6.8h, v6.8h, v17.8h
- smax v4.8h, v4.8h, v18.8h
- smax v6.8h, v6.8h, v18.8h
+ umin v4.8h, v4.8h, v17.8h
+ umin v6.8h, v6.8h, v17.8h
stp q4, q6, [x9], #32
.endif
b.ne 17b
--
2.46.0
More information about the ffmpeg-devel mailing list