[FFmpeg-cvslog] lavc/aarch64: move transpose_4x8H to neon.S

Sat Aug 21 00:11:27 EEST 2021

ffmpeg | branch: master | Mikhail Nitenko <mnitenko at gmail.com> | Fri Aug 20 00:07:59 2021 +0300| [756d2e087a73c1e1ebe9647e085a1f5c90fa87de] | committer: Martin Storsjö

lavc/aarch64: move transpose_4x8H to neon.S

transpose_4x8H was declared in vp9lpf_16bpp_neon.S; however, this macro
is not unique to vp9 and could be used elsewhere.

Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
Signed-off-by: Martin Storsjö <martin at martin.st>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=756d2e087a73c1e1ebe9647e085a1f5c90fa87de
---

 libavcodec/aarch64/neon.S              | 13 +++++++++++++
 libavcodec/aarch64/vp9lpf_16bpp_neon.S | 12 ------------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 0fddbecae3..1ad32c359d 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -109,12 +109,25 @@
         trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
         trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
         trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
+
         trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
         trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
         trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
         trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
 .endm
 
+.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8H,  \r0\().8H,  \r1\().8H
+        trn2            \t5\().8H,  \r0\().8H,  \r1\().8H
+        trn1            \t6\().8H,  \r2\().8H,  \r3\().8H
+        trn2            \t7\().8H,  \r2\().8H,  \r3\().8H
+
+        trn1            \r0\().4S,  \t4\().4S,  \t6\().4S
+        trn2            \r2\().4S,  \t4\().4S,  \t6\().4S
+        trn1            \r1\().4S,  \t5\().4S,  \t7\().4S
+        trn2            \r3\().4S,  \t5\().4S,  \t7\().4S
+.endm
+
 .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
         trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
         trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
index 9075f3d406..9869614a29 100644
--- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -22,18 +22,6 @@
 #include "neon.S"
 
 
-.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
-        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
-        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
-        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
-
-        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
-        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
-        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
-        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
-.endm
-
 // The input to and output from this macro is in the registers v16-v31,
 // and v0-v7 are used as scratch registers.
 // p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31