[FFmpeg-cvslog] aarch64: vp8: Optimize put_epel16_h6v6 with vp8_epel8_v6_y2

Martin Storsjö git at videolan.org
Thu Mar 14 21:23:57 EET 2019


ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Fri Feb  1 09:47:30 2019 +0200| [37394ef01b040605f8e1c98e73aa12b1c0bcba07] | committer: Martin Storsjö

aarch64: vp8: Optimize put_epel16_h6v6 with vp8_epel8_v6_y2

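This makes it similar to put_epel16_v6: the vertical pass now uses
vp8_epel8_v6_y2 to filter two output rows per loop iteration instead
of one. This gives a large speedup on Cortex A53, a minor speedup on
A72 and a very minor slowdown on A73.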

Before:                 Cortex A53     A72     A73
vp8_put_epel16_h6v6_neon:   2211.4  1586.5  1431.7
After:
vp8_put_epel16_h6v6_neon:   1736.9  1522.0  1448.1

Signed-off-by: Martin Storsjö <martin at martin.st>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=37394ef01b040605f8e1c98e73aa12b1c0bcba07
---

 libavcodec/aarch64/vp8dsp_neon.S | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index 604be8a8bf..139b380fa4 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -769,23 +769,6 @@ endfunc
         sqrshrun2       \d0\().16b, v22.8h, #7
 .endm
 
-.macro  vp8_epel8_v6    d0,  s0,  s1,  s2, s3, s4, s5
-        uxtl            \s2\().8h, \s2\().8b
-        uxtl            \s3\().8h, \s3\().8b
-        uxtl            \s1\().8h, \s1\().8b
-        uxtl            \s4\().8h, \s4\().8b
-        uxtl            \s0\().8h, \s0\().8b
-        uxtl            \s5\().8h, \s5\().8b
-        mul             \s2\().8h, \s2\().8h, v0.h[2]
-        mul             \s3\().8h, \s3\().8h, v0.h[3]
-        mls             \s2\().8h, \s1\().8h, v0.h[1]
-        mls             \s3\().8h, \s4\().8h, v0.h[4]
-        mla             \s2\().8h, \s0\().8h, v0.h[0]
-        mla             \s3\().8h, \s5\().8h, v0.h[5]
-        sqadd           \s3\().8h, \s2\().8h, \s3\().8h
-        sqrshrun        \d0\().8b, \s3\().8h, #7
-.endm
-
 .macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
         uxtl            \s0\().8h, \s0\().8b
         uxtl            \s3\().8h, \s3\().8b
@@ -942,15 +925,18 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
 2:
         ld1             {v1.8b - v4.8b},    [x7], #32
         ld1             {v16.8b - v19.8b},  [x7], #32
-        ld1             {v20.8b - v23.8b},  [x7]
-        sub             x7,  x7,  #48
+        ld1             {v20.8b - v23.8b},  [x7], #32
+        ld1             {v24.8b - v25.8b},  [x7]
+        sub             x7,  x7,  #64
 
-        vp8_epel8_v6    v5, v1, v3, v16, v18, v20, v22
-        vp8_epel8_v6    v2, v2, v4, v17, v19, v21, v23
-        trn1            v2.2d, v5.2d, v2.2d
+        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
+        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
+        trn1            v1.2d, v1.2d, v2.2d
+        trn1            v3.2d, v3.2d, v4.2d
 
-        st1             {v2.16b}, [x0], x1
-        subs            x4, x4, #1
+        st1             {v1.16b}, [x0], x1
+        st1             {v3.16b}, [x0], x1
+        subs            x4, x4, #2
         b.ne            2b
 
         add             sp,  sp,  #336+16



More information about the ffmpeg-cvslog mailing list